def __init__(self, sym: str, exchange: str):
    """
    OrderBook constructor.

    :param sym: instrument name
    :param exchange: 'coinbase' or 'bitfinex' or 'bitmex'
    """
    self.sym = sym
    self.db = Database(sym=sym, exchange=exchange)
    self.db.init_db_connection()
    self.bids = BOOK_BY_EXCHANGE[exchange](sym=sym, side='bids')
    self.asks = BOOK_BY_EXCHANGE[exchange](sym=sym, side='asks')
    self.exchange = exchange
    self.midpoint = float()
    self.spread = float()
    self.buy_tracker = TradeTracker()
    self.sell_tracker = TradeTracker()
    self.last_tick_time = None
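# A minimal sketch of the lookup table assumed by the constructor above:
# BOOK_BY_EXCHANGE maps an exchange name to the class implementing one side
# of that exchange's limit order book. CoinbaseBook and BitfinexBook follow
# the names used elsewhere in this file; BitmexBook is an assumption inferred
# from the 'bitmex' option in the docstring.
BOOK_BY_EXCHANGE = {
    'coinbase': CoinbaseBook,
    'bitfinex': BitfinexBook,
    'bitmex': BitmexBook,  # hypothetical; not defined in this file
}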
def __init__(self, ccy, exchange):
    """
    OrderBook constructor.

    :param ccy: currency symbol
    :param exchange: 'coinbase' or 'bitfinex'
    """
    self.sym = ccy
    self.db = Database(ccy, exchange)
    self.bids = CoinbaseBook(ccy, 'bids') if exchange == 'coinbase' \
        else BitfinexBook(ccy, 'bids')
    self.asks = CoinbaseBook(ccy, 'asks') if exchange == 'coinbase' \
        else BitfinexBook(ccy, 'asks')
    self.midpoint = float()
    self.trade_tracker = {'buys': 0.0, 'sells': 0.0}
class Simulator(object):
    def __init__(self):
        """
        Simulator constructor.
        """
        self.cwd = os.path.dirname(os.path.realpath(__file__))
        self.db = Database(sym='None', exchange='None', record_data=False)

    def __str__(self):
        return 'Simulator: [ db={} ]'.format(self.db)

    @staticmethod
    def export_to_csv(data: pd.DataFrame,
                      filename: str = 'BTC-USD_2019-01-01',
                      compress: bool = True) -> None:
        """
        Export data within a pandas DataFrame to a csv.

        :param data: (pd.DataFrame) historical tick data
        :param filename: CCY_YYYY-MM-DD
        :param compress: Default True. If True, compress with xz
        """
        start_time = dt.now(tz=TIMEZONE)

        sub_folder = os.path.join(DATA_PATH, filename) + '.csv'

        if compress:  # xz compression
            sub_folder += '.xz'
            data.to_csv(path_or_buf=sub_folder, index=False, compression='xz')
        else:
            data.to_csv(path_or_buf=sub_folder, index=False)

        elapsed = (dt.now(tz=TIMEZONE) - start_time).seconds
        LOGGER.info('Exported %s with %i rows in %i seconds' %
                    (sub_folder, data.shape[0], elapsed))

    @staticmethod
    def get_ema_labels(features_list: list, ema_list: list,
                       include_system_time: bool) -> list:
        """
        Get a list of column labels for EMA values in a list.
        """
        assert isinstance(ema_list, list), \
            "Error: EMA_LIST must be a list data type, not {}".format(
                type(ema_list))

        ema_labels = list()

        for ema in ema_list:
            for col in features_list:
                if col == 'system_time':
                    continue
                ema_labels.append('{}_{}'.format(col, ema))

        if include_system_time:
            ema_labels.insert(0, 'system_time')

        return ema_labels

    @staticmethod
    def _get_microsecond_delta(new_tick_time: dt,
                               last_snapshot_time: dt) -> int:
        """
        Calculate the difference between two consecutive ticks.

        Note: only tracks timedeltas of up to one minute.

        :param new_tick_time: datetime of incoming tick
        :param last_snapshot_time: datetime of last LOB snapshot
        :return: (int) delta between ticks, in microseconds
        """
        if last_snapshot_time > new_tick_time:
            return -1

        snapshot_tick_time_delta = new_tick_time - last_snapshot_time
        seconds = snapshot_tick_time_delta.seconds * 1000000
        microseconds = snapshot_tick_time_delta.microseconds

        return seconds + microseconds

    def get_orderbook_snapshot_history(self, query: dict) -> pd.DataFrame or None:
        """
        Replay historical market data and generate the features used for
        reinforcement learning & training.

        NOTE: The query can either be a single Coinbase CCY, or both Coinbase
        and Bitfinex, but it cannot be only a Bitfinex CCY. Later releases of
        this repo will support Bitfinex-only order book reconstruction.

        :param query: (dict) query for finding tick history in Arctic TickStore
        :return: (pd.DataFrame) snapshots of limit order books using a
                 stationary feature set
        """
        self.db.init_db_connection()

        tick_history = self.db.get_tick_history(query=query)
        if tick_history is None:
            LOGGER.warning("Query returned no data: {}".format(query))
            return None

        loop_length = tick_history.shape[0]

        # number of milliseconds between LOB snapshots
        snapshot_interval_milliseconds = SNAPSHOT_RATE_IN_MICROSECONDS // 1000

        snapshot_list = list()
        last_snapshot_time = None
        tick_types_for_warm_up = {'load_book', 'book_loaded', 'preload'}

        instrument_name = query['ccy'][0]
        assert isinstance(instrument_name, str), \
            "Error: instrument_name must be a string, not -> {}".format(
                type(instrument_name))
        LOGGER.info('querying {}'.format(instrument_name))

        order_book = get_orderbook_from_symbol(symbol=instrument_name)(
            sym=instrument_name)

        start_time = dt.now(TIMEZONE)
        LOGGER.info(
            'Starting get_orderbook_snapshot_history() loop with %i ticks for %s'
            % (loop_length, query['ccy']))

        # loop through all ticks returned from the Arctic Tick Store query.
        for count, tx in enumerate(tick_history.itertuples()):
            # periodically print the number of steps completed
            if count % 250000 == 0:
                elapsed = (dt.now(TIMEZONE) - start_time).seconds
                LOGGER.info('...completed %i loops in %i seconds' %
                            (count, elapsed))

            # convert to dictionary for processing
            tick = tx._asdict()

            # filter out bad ticks
            if 'type' not in tick:
                continue

            # flags for an order book reset
            if tick['type'] in tick_types_for_warm_up:
                order_book.new_tick(msg=tick)
                continue

            # check if the LOB is pre-loaded; if not, skip the message and
            # do NOT process it.
            if order_book.done_warming_up is False:
                LOGGER.info("{} order book is not done warming up: {}".format(
                    instrument_name, tick))
                continue

            # timestamp for the incoming tick
            new_tick_time = parse(tick.get('system_time'))

            # remove ticks without timestamps (should not exist/happen)
            if new_tick_time is None:
                LOGGER.info('No tick time: {}'.format(tick))
                continue

            # initialize the LOB snapshot timer
            if last_snapshot_time is None:
                # process first ticks and check if they're stale ticks;
                # if so, skip to the next loop.
                order_book.new_tick(tick)

                last_tick_time = order_book.last_tick_time
                if last_tick_time is None:
                    continue

                last_tick_time_dt = parse(last_tick_time)
                last_snapshot_time = last_tick_time_dt
                LOGGER.info('{} first tick: {}'.format(
                    order_book.sym, new_tick_time))
                # skip to next loop
                continue

            # calculate the amount of time between the incoming tick and
            # the tick received before that
            diff = self._get_microsecond_delta(new_tick_time,
                                               last_snapshot_time)

            # update the LOB, but do not take a LOB snapshot if the tick time
            # is out of sequence. This generally occurs when pre-loading a LOB
            # with stale tick times.
            if diff == -1:
                order_book.new_tick(msg=tick)
                continue

            # derive the number of LOB snapshot insertions for the data buffer.
            multiple = diff // SNAPSHOT_RATE_IN_MICROSECONDS  # 1000000 is 1 second

            # proceed if we have one or more insertions to make
            if multiple <= 0:
                order_book.new_tick(msg=tick)
                continue

            order_book_snapshot = order_book.render_book()
            for i in range(multiple):
                last_snapshot_time += timedelta(
                    milliseconds=snapshot_interval_milliseconds)
                snapshot_list.append(
                    np.hstack((last_snapshot_time, order_book_snapshot)))

            # update the order book with the most recent tick now, so the
            # snapshots are up to date for the next iteration of the loop.
            order_book.new_tick(msg=tick)
            continue

        elapsed = max((dt.now(TIMEZONE) - start_time).seconds, 1)
        LOGGER.info('Completed get_orderbook_snapshot_history() with %i ticks '
                    'in %i seconds at %i ticks/second'
                    % (loop_length, elapsed, loop_length // elapsed))

        orderbook_snapshot_history = pd.DataFrame(
            data=snapshot_list,
            columns=['system_time'] + order_book.render_lob_feature_names())

        # remove NAs from the data set (and log the amount dropped)
        before_shape = orderbook_snapshot_history.shape[0]
        orderbook_snapshot_history = orderbook_snapshot_history.dropna(axis=0)
        difference_in_records = orderbook_snapshot_history.shape[0] - before_shape
        LOGGER.info("Dropped {} rows due to NA values".format(
            abs(difference_in_records)))

        return orderbook_snapshot_history

    def extract_features(self, query: dict) -> None:
        """
        Create and export limit order book data to csv. This function exports
        multiple days of data and ensures each day starts and ends exactly
        on time.

        :param query: (dict) ccy=sym, daterange=(YYYYMMDD,YYYYMMDD)
        :return: void
        """
        start_time = dt.now(tz=TIMEZONE)

        order_book_data = self.get_orderbook_snapshot_history(query=query)
        if order_book_data is not None:
            dates = order_book_data['system_time'].dt.date.unique()
            LOGGER.info('dates: {}'.format(dates))
            for date in dates:
                tmp = order_book_data.loc[
                    order_book_data['system_time'].dt.date == date]
                self.export_to_csv(
                    tmp, filename='{}_{}'.format(query['ccy'][0], date),
                    compress=True)

        elapsed = (dt.now(tz=TIMEZONE) - start_time).seconds
        LOGGER.info(
            '***\nSimulator.extract_features() executed in %i seconds\n***'
            % elapsed)
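# A minimal usage sketch (an assumption, not part of the original source):
# extract_features() takes an Arctic TickStore query of the shape described
# in its docstring -- ccy=sym, daterange=(YYYYMMDD, YYYYMMDD). The symbol and
# dates below are placeholders.
if __name__ == '__main__':
    sim = Simulator()
    sim.extract_features(query={
        'ccy': ['BTC-USD'],
        'daterange': (20190101, 20190102),
    })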
class OrderBook(ABC):
    def __init__(self, ccy, exchange):
        """
        OrderBook constructor.

        :param ccy: currency symbol
        :param exchange: 'coinbase' or 'bitfinex'
        """
        self.sym = ccy
        self.db = Database(sym=ccy, exchange=exchange)
        self.db.init_db_connection()
        self.bids = CoinbaseBook(ccy, 'bids') if exchange == 'coinbase' \
            else BitfinexBook(ccy, 'bids')
        self.asks = CoinbaseBook(ccy, 'asks') if exchange == 'coinbase' \
            else BitfinexBook(ccy, 'asks')
        self.midpoint = float()
        self.buy_tracker = TradeTracker()
        self.sell_tracker = TradeTracker()
        self.last_tick_time = None

    def __str__(self):
        return '%s || %s' % (self.bids, self.asks)

    @abstractmethod
    def new_tick(self, msg):
        """
        Event handler for incoming tick messages.

        :param msg: incoming order or trade message
        """
        pass

    def clear_trade_trackers(self):
        """
        Reset buy and sell trade trackers; used between LOB snapshots.

        :return: (void)
        """
        self.buy_tracker.clear()
        self.sell_tracker.clear()

    def clear_book(self):
        """
        Reset the limit order book.

        :return: (void)
        """
        self.bids.clear()
        self.asks.clear()
        self.last_tick_time = None
        print('--Cleared %s order book--' % self.sym)

    def render_book(self):
        """
        Create a stationary feature set for the limit order book.

        Inspired by: https://arxiv.org/abs/1810.09965v1

        :return: numpy array
        """
        bid_price, bid_level = self.bids.get_bid()
        ask_price, ask_level = self.asks.get_ask()

        self.midpoint = (bid_price + ask_price) / 2.0

        bid_data = self.bids.get_bids_to_list(midpoint=self.midpoint)
        ask_data = self.asks.get_asks_to_list(midpoint=self.midpoint)

        buy_trades = np.array(self.buy_tracker.notional)
        sell_trades = np.array(self.sell_tracker.notional)

        self.clear_trade_trackers()

        if INCLUDE_ORDERFLOW:
            bid_distances, bid_notionals, bid_cancel_notionals, \
                bid_limit_notionals, bid_market_notionals = bid_data
            ask_distances, ask_notionals, ask_cancel_notionals, \
                ask_limit_notionals, ask_market_notionals = ask_data

            return np.hstack((bid_notionals, ask_notionals,
                              bid_distances, ask_distances,
                              buy_trades, sell_trades,
                              bid_cancel_notionals, ask_cancel_notionals,
                              bid_limit_notionals, ask_limit_notionals,
                              bid_market_notionals, ask_market_notionals))
        else:
            bid_distances, bid_notionals = bid_data
            ask_distances, ask_notionals = ask_data

            return np.hstack((bid_notionals, ask_notionals,
                              bid_distances, ask_distances,
                              buy_trades, sell_trades))

    @property
    def best_bid(self):
        """
        Get the best bid.

        :return: float best bid
        """
        return self.bids.get_bid()

    @property
    def best_ask(self):
        """
        Get the best ask.

        :return: float best ask
        """
        return self.asks.get_ask()

    def done_warming_up(self):
        """
        Flag to indicate whether the entire limit order book has been loaded.

        :return: True if loaded / False if still waiting to download
        """
        # Note: `~` is bitwise inversion, not boolean negation -- on Python
        # bools, `~True` is -2 (truthy), so the original expression
        # `~self.bids.warming_up & ~self.asks.warming_up` was always truthy.
        return not self.bids.warming_up and not self.asks.warming_up
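# A short demonstration of why the `~` fix above matters (illustrative only,
# not part of the original source): on Python bools, `~` performs bitwise
# inversion on the underlying int, so both branches are non-zero (truthy) and
# the bitwise form can never report "still warming up".
assert ~True == -2 and ~False == -1          # both non-zero, i.e. truthy
assert bool(~True & ~False) is True          # buggy form: always True
assert (not True and not False) is False     # corrected form behaves as intended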
class OrderBook(ABC):
    def __init__(self, ccy: str, exchange: str):
        """
        OrderBook constructor.

        :param ccy: currency symbol
        :param exchange: 'coinbase' or 'bitfinex'
        """
        self.sym = ccy
        self.db = Database(sym=ccy, exchange=exchange)
        self.db.init_db_connection()
        self.bids = get_orderbook(name=exchange)(sym=ccy, side='bids')
        self.asks = get_orderbook(name=exchange)(sym=ccy, side='asks')
        self.exchange = exchange
        self.midpoint = float()
        self.spread = float()
        self.buy_tracker = TradeTracker()
        self.sell_tracker = TradeTracker()
        self.last_tick_time = None

    def __str__(self):
        return '%s || %s' % (self.bids, self.asks)

    @abstractmethod
    def new_tick(self, msg: dict) -> bool:
        """
        Event handler for incoming tick messages.

        :param msg: incoming order or trade message
        :return: FALSE if a reconnection to the WebSocket is needed,
            else TRUE if good
        """
        return True

    def clear_trade_trackers(self) -> None:
        """
        Reset buy and sell trade trackers; used between LOB snapshots.

        :return: (void)
        """
        self.buy_tracker.clear()
        self.sell_tracker.clear()

    def clear_book(self) -> None:
        """
        Reset the limit order book.

        :return: (void)
        """
        self.bids.clear()  # warming_up flag reset in `Position` class
        self.asks.clear()  # warming_up flag reset in `Position` class
        self.last_tick_time = None
        LOGGER.info("{}'s order book cleared.".format(self.sym))

    def render_book(self) -> np.ndarray:
        """
        Create a stationary feature set for the limit order book.

        :return: LOB feature set
        """
        # get price levels of the LOB
        bid_price, bid_level = self.bids.get_bid()
        ask_price, ask_level = self.asks.get_ask()

        # derive midpoint price and spread from bid and ask data
        self.midpoint = (ask_price + bid_price) / 2.0
        self.spread = round(ask_price - bid_price, 4)  # round to clean up float noise

        # transform raw LOB data into a stationary feature set
        bid_data = self.bids.get_bids_to_list(midpoint=self.midpoint)
        ask_data = self.asks.get_asks_to_list(midpoint=self.midpoint)

        # convert buy and sell trade notional values to arrays
        buy_trades = np.array(self.buy_tracker.notional)
        sell_trades = np.array(self.sell_tracker.notional)

        # reset trackers after each LOB render
        self.clear_trade_trackers()

        return np.hstack((self.midpoint, self.spread, buy_trades, sell_trades,
                          *bid_data, *ask_data))

    @staticmethod
    def render_lob_feature_names(include_orderflow: bool = INCLUDE_ORDERFLOW) -> list:
        """
        Get the column names for the LOB render features.

        :param include_orderflow: if TRUE, order flow imbalance stats are
            included in the set
        :return: list containing feature names
        """
        feature_names = list()

        feature_names.append('midpoint')
        feature_names.append('spread')
        feature_names.append('buys')
        feature_names.append('sells')

        feature_types = ['distance', 'notional']
        if include_orderflow:
            feature_types += ['cancel_notional', 'limit_notional',
                              'market_notional']

        for side in ['bid', 'ask']:
            for feature in feature_types:
                for row in range(MAX_BOOK_ROWS):
                    feature_names.append("{}_{}_{}".format(side, feature, row))

        LOGGER.info("render_lob_feature_names() has {} features".format(
            len(feature_names)))

        return feature_names

    @property
    def best_bid(self) -> float:
        """
        Get the best bid.

        :return: float best bid
        """
        return self.bids.get_bid()

    @property
    def best_ask(self) -> float:
        """
        Get the best ask.

        :return: float best ask
        """
        return self.asks.get_ask()

    @property
    def done_warming_up(self) -> bool:
        """
        Flag to indicate whether the entire limit order book has been loaded.

        :return: True if loaded / False if still waiting to download
        """
        # `&` binds tighter than `is`, so the original chained expression
        # effectively ignored the asks side; use `and` for boolean logic.
        return self.bids.warming_up is False and self.asks.warming_up is False
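# A minimal sketch of a concrete subclass (an illustration only -- the real
# exchange-specific books live elsewhere in this repo, and the TradeTracker
# and book-side method names below are assumptions). It shows the one piece
# of behavior the ABC leaves abstract: routing an incoming message to the
# bid/ask books or the trade trackers inside new_tick().
class DemoOrderBook(OrderBook):
    def new_tick(self, msg: dict) -> bool:
        """Apply one (hypothetical) message to the book."""
        self.last_tick_time = msg.get('time')
        if msg.get('type') == 'trade':
            # accumulate trade notionals between LOB snapshots
            tracker = self.buy_tracker if msg['side'] == 'buy' \
                else self.sell_tracker
            tracker.add(notional=msg['price'] * msg['size'])  # assumed TradeTracker API
        elif msg.get('type') == 'order':
            book = self.bids if msg['side'] == 'buy' else self.asks
            book.insert_order(msg)  # assumed book-side API
        return True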
class Simulator(object):
    def __init__(self, z_score=True, alpha=None):
        """
        Simulator constructor.

        :param z_score: if TRUE, normalize data with z-score,
            ELSE use min-max scaler
        """
        self._scaler = StandardScaler() if z_score else MinMaxScaler()
        self.cwd = os.path.dirname(os.path.realpath(__file__))
        self.ema = load_ema(alpha=alpha)
        self.alpha = alpha
        self.db = Database(sym='None', exchange='None', record_data=False)

    def __str__(self):
        return 'Simulator: [ scaler={} | ema={} ]'.format(
            self._scaler.__class__, self.ema)

    @staticmethod
    def get_feature_labels(include_system_time: bool = True,
                           include_bitfinex: bool = True,
                           include_order_flow: bool = INCLUDE_ORDERFLOW,
                           include_imbalances: bool = True,
                           include_spread: bool = False,
                           include_ema=None) -> list:
        """
        Create the features' labels.

        :param include_system_time: True/False
            (False removes the system_time column)
        :param include_bitfinex: (boolean) if TRUE, Bitfinex's LOB data is
            included in the dataset, in addition to Coinbase Pro
        :param include_order_flow: True/False
            if TRUE, order arrival metrics are included in the feature set
        :param include_imbalances: True/False
            if TRUE, order volume imbalances at each level are included
            in the feature set
        :param include_spread: True/False
            if TRUE, the order spread column is included
        :param include_ema: None, float, or list
            if a list, append alphas to each column
        :return: list of feature labels
        """
        columns = list()

        if include_system_time:
            columns.append('system_time')

        columns.append('coinbase_midpoint')

        exchanges = ['coinbase']
        if include_bitfinex:
            columns.append('midpoint_delta')
            exchanges.append('bitfinex')

        for exchange in exchanges:
            for feature in ['notional', 'distance']:
                for side in ['bid', 'ask']:
                    if side == 'bid':
                        for level in reversed(range(MAX_BOOK_ROWS)):
                            columns.append('%s_%s_%s_%i' %
                                           (exchange, side, feature, level))
                    else:
                        for level in range(MAX_BOOK_ROWS):
                            columns.append('%s_%s_%s_%i' %
                                           (exchange, side, feature, level))

            for trade_side in ['buys', 'sells']:
                columns.append('%s_%s' % (exchange, trade_side))

            if include_order_flow:
                for feature in ['cancel_notional', 'limit_notional',
                                'market_notional']:
                    for side in ['bid', 'ask']:
                        if side == 'bid':
                            for level in reversed(range(MAX_BOOK_ROWS)):
                                columns.append('%s_%s_%s_%i' %
                                               (exchange, side, feature, level))
                        else:
                            for level in range(MAX_BOOK_ROWS):
                                columns.append('%s_%s_%s_%i' %
                                               (exchange, side, feature, level))

            if include_spread:
                columns.append('{}_spread'.format(exchange))

        if include_imbalances:
            for level in range(MAX_BOOK_ROWS):
                columns.append('notional_imbalance_{}'.format(level))
            columns.append('notional_imbalance_mean')
            columns.append('notional_imbalance_std')

        if isinstance(include_ema, list):
            tmp = list()
            for ema in include_ema:
                for col in columns:
                    if col == 'system_time':
                        continue
                    tmp.append('{}_{}'.format(col, ema))
            if include_system_time:
                tmp.insert(0, 'system_time')
            columns = tmp

        return columns

    def export_to_csv(self, data: pd.DataFrame,
                      filename='BTC-USD_2019-01-01', compress=True):
        """
        Export data within a pandas DataFrame to a csv.

        :param data: (pd.DataFrame) historical tick data
        :param filename: CCY_YYYY-MM-DD
        :param compress: Default True. If True, compress with xz
        :return: void
        """
        start_time = dt.now(tz=TIMEZONE)

        sub_folder = os.path.join(self.cwd, 'data_exports', filename) + '.csv'

        if compress:  # xz compression
            sub_folder += '.xz'
            data.to_csv(path_or_buf=sub_folder, index=False, compression='xz')
        else:
            data.to_csv(path_or_buf=sub_folder, index=False)

        elapsed = (dt.now(tz=TIMEZONE) - start_time).seconds
        print('Exported %s with %i rows in %i seconds' %
              (sub_folder, data.shape[0], elapsed))

    @staticmethod
    def import_csv(filename: str) -> pd.DataFrame:
        """
        Import a historical tick file created from the export_to_csv()
        function.

        :param filename: full file path, including the filename
        :return: (pd.DataFrame) historical limit order book data
        """
        start_time = dt.now(tz=TIMEZONE)

        if 'xz' in filename:
            data = pd.read_csv(filepath_or_buffer=filename, index_col=0,
                               compression='xz', engine='c')
        elif 'csv' in filename:
            data = pd.read_csv(filepath_or_buffer=filename, index_col=0,
                               engine='c')
        else:
            print('Error: file must be a csv or xz')
            data = None

        elapsed = (dt.now(tz=TIMEZONE) - start_time).seconds
        print('Imported %s from a csv in %i seconds' %
              (filename[-21:], elapsed))
        return data

    def fit_scaler(self, orderbook_snapshot_history: pd.DataFrame):
        """
        Fit the scaler for limit order book data for the neural network.

        :param orderbook_snapshot_history: limit order book data
            from the previous day
        :return: (void)
        """
        self._scaler.fit(orderbook_snapshot_history)

    def scale_data(self, data: pd.DataFrame):
        """
        Normalize data.

        :param data: (np.array) all data in environment
        :return: (np.array) normalized observation space
        """
        return self._scaler.transform(data)

    @staticmethod
    def _midpoint_diff(data: pd.DataFrame):
        """
        Take the log difference of midpoint prices:
            log(price_t) - log(price_t-1)

        :param data: (pd.DataFrame) raw data from LOB snapshots
        :return: (pd.DataFrame) with midpoint prices normalized
        """
        data['coinbase_midpoint'] = np.log(data['coinbase_midpoint'].values)
        data['coinbase_midpoint'] = (
            data['coinbase_midpoint'] -
            data['coinbase_midpoint'].shift(1)).fillna(method='bfill')
        return data

    @staticmethod
    def _spread_calc(data: pd.DataFrame) -> pd.DataFrame:
        """
        Derive the spread and normalize it by a multiple of the market
        order fee.

        :param data: (pd.DataFrame) data set containing a bid and ask
        :return: data with spread added as the last column
        """
        # calculate the spread in real terms
        # ('+' because bid_distances are all negative)
        data['coinbase_spread'] = data['coinbase_ask_distance_0'].values + \
            data['coinbase_bid_distance_0'].values
        return data

    @staticmethod
    def _get_order_imbalance(data: pd.DataFrame):
        """
        Calculate order imbalances per price level, plus their mean
        & standard deviation.

        Order imbalances are calculated as:
            (bid_quantity - ask_quantity) / (bid_quantity + ask_quantity)

        ...and thus scale to [-1, 1].

        :param data: raw/unnormalized LOB snapshot data
        :return: (pd.DataFrame) order imbalances at N levels, plus the mean
            & std imbalance
        """
        # create the column names for making a data frame
        # (also used for debugging)
        bid_notional_columns, ask_notional_columns, imbalance_columns = \
            [], [], []
        for i in range(MAX_BOOK_ROWS):
            bid_notional_columns.append('coinbase_bid_notional_{}'.format(i))
            ask_notional_columns.append('coinbase_ask_notional_{}'.format(i))
            imbalance_columns.append('notional_imbalance_{}'.format(i))

        # acquire bid and ask notional data; reverse the bids to ascending
        # order, so that they align with the asks
        bid_notional = data[bid_notional_columns].values[::-1]
        ask_notional = data[ask_notional_columns].values

        # calculate the order imbalance
        imbalances = (bid_notional - ask_notional) / \
                     (bid_notional + ask_notional)
        imbalances = pd.DataFrame(imbalances,
                                  columns=imbalance_columns).fillna(0.)

        # add meta data to features (mean and std)
        imbalances['notional_imbalance_mean'] = \
            imbalances[imbalance_columns].mean(axis=1)
        imbalances['notional_imbalance_std'] = \
            imbalances[imbalance_columns].std(axis=1)

        return imbalances

    def load_environment_data(self, fitting_file: str, testing_file: str,
                              include_imbalances: bool = True,
                              as_pandas: bool = False):
        """
        Import and scale the environment data set with the prior day's data.

        The midpoint gets log-normalized:
            log(price_t) - log(price_t-1)

        :param fitting_file: prior trading day
        :param testing_file: current trading day
        :param include_imbalances: if TRUE, include LOB imbalances
        :param as_pandas: if TRUE, return data as a DataFrame,
            otherwise as np.array
        :return: (pd.DataFrame or np.array) scaled environment data
        """
        # import data used to fit the scaler
        fitting_data_filepath = os.path.join(self.cwd, 'data_exports',
                                             fitting_file)
        fitting_data = self.import_csv(filename=fitting_data_filepath)

        # check if bitfinex data is in the data set
        # (substring match, since columns are e.g. 'bitfinex_bid_notional_0')
        include_bitfinex = any('bitfinex' in col
                               for col in fitting_data.columns)

        # carry on with the data import process
        fitting_data = self._midpoint_diff(data=fitting_data)  # normalize midpoint
        fitting_data = self._spread_calc(data=fitting_data)  # normalize spread
        fitting_data = apply_ema_all_data(ema=self.ema, data=fitting_data)

        self.fit_scaler(fitting_data)
        del fitting_data

        # import data to normalize and use in the environment
        data_used_in_environment = os.path.join(self.cwd, 'data_exports',
                                                testing_file)
        data = self.import_csv(filename=data_used_in_environment)
        midpoint_prices = data['coinbase_midpoint']

        normalized_data = self._midpoint_diff(data.copy(deep=True))
        normalized_data = self._spread_calc(data=normalized_data)  # normalize spread
        normalized_data = apply_ema_all_data(ema=self.ema,
                                             data=normalized_data)

        normalized_data = self.scale_data(normalized_data)
        normalized_data = np.clip(normalized_data, -10, 10)
        normalized_data = pd.DataFrame(
            normalized_data,
            columns=self.get_feature_labels(include_system_time=False,
                                            include_bitfinex=include_bitfinex,
                                            include_spread=True,
                                            include_imbalances=False,
                                            include_ema=self.alpha))

        if include_imbalances:
            print('Adding order imbalances...')
            # NOTE: since order imbalance data is already scaled to [-1, 1],
            # we do not apply the z-score to the imbalance data
            imbalance_data = self._get_order_imbalance(data=data)
            imbalance_data = apply_ema_all_data(ema=reset_ema(self.ema),
                                                data=imbalance_data)
            normalized_data = pd.concat((normalized_data, imbalance_data),
                                        axis=1)

        if as_pandas is False:
            midpoint_prices = midpoint_prices.values
            data = data.values
            normalized_data = normalized_data.values

        return midpoint_prices, data, normalized_data

    @staticmethod
    def _get_microsecond_delta(new_tick_time: dt, last_snapshot_time: dt):
        """
        Calculate the difference between two consecutive ticks.

        Note: only tracks timedeltas of up to one minute.

        :param new_tick_time: datetime of incoming tick
        :param last_snapshot_time: datetime of last LOB snapshot
        :return: (int) delta between ticks, in microseconds
        """
        if last_snapshot_time > new_tick_time:
            return -1

        snapshot_tick_time_delta = new_tick_time - last_snapshot_time
        seconds = snapshot_tick_time_delta.seconds * 1000000
        microseconds = snapshot_tick_time_delta.microseconds

        return seconds + microseconds

    def get_orderbook_snapshot_history(self, query: dict):
        """
        Replay historical market data and generate the features used for
        reinforcement learning & training.

        NOTE: The query can either be a single Coinbase CCY, or both Coinbase
        and Bitfinex, but it cannot be only a Bitfinex CCY. Later releases of
        this repo will support Bitfinex-only order book reconstruction.

        :param query: (dict) query for finding tick history in Arctic TickStore
        :return: (pd.DataFrame) snapshots of limit order books using a
                 stationary feature set
        """
        self.db.init_db_connection()

        tick_history = self.db.get_tick_history(query=query)
        if tick_history is None:
            print("Query returned no data: {}".format(query))
            return None

        loop_length = tick_history.shape[0]

        # number of milliseconds between LOB snapshots
        snapshot_interval_milliseconds = SNAPSHOT_RATE_IN_MICROSECONDS // 1000

        snapshot_list = list()
        last_snapshot_time = None

        symbols = query['ccy']
        print('querying {}'.format(symbols))

        include_bitfinex = len(symbols) > 1
        if include_bitfinex:
            print('\n\nIncluding Bitfinex data in feature set.\n\n')

        coinbase_order_book = CoinbaseOrderBook(symbols[0])
        bitfinex_order_book = BitfinexOrderBook(symbols[1]) \
            if include_bitfinex else None

        start_time = dt.now(TIMEZONE)
        print('Starting get_orderbook_snapshot_history() loop with %i ticks '
              'for %s' % (loop_length, query['ccy']))

        # loop through all ticks returned from the Arctic Tick Store query.
        for count, tx in enumerate(tick_history.itertuples()):
            # periodically print the number of steps completed
            if count % 250000 == 0:
                elapsed = (dt.now(TIMEZONE) - start_time).seconds
                print('...completed %i loops in %i seconds' % (count, elapsed))

            # convert to dictionary for processing
            tick = tx._asdict()

            # determine if the incoming tick is from coinbase or bitfinex
            coinbase = tick['product_id'] == coinbase_order_book.sym

            # filter out bad ticks
            if 'type' not in tick:
                continue

            # flags for an order book reset
            if tick['type'] in ['load_book', 'book_loaded', 'preload']:
                if coinbase:
                    coinbase_order_book.new_tick(tick)
                else:
                    bitfinex_order_book.new_tick(tick)
                # skip to next loop
                continue

            # incoming tick is for the coinbase LOB
            if coinbase:
                # check if the LOB is pre-loaded; if not, skip the message
                # and do NOT process it.
                if coinbase_order_book.done_warming_up() is False:
                    print("coinbase_order_book not done warming up: {}".format(
                        tick))
                    continue

                # timestamp for the incoming tick
                new_tick_time = parse(tick.get('time'))

                # remove ticks without timestamps (should not exist/happen)
                if new_tick_time is None:
                    print('No tick time: {}'.format(tick))
                    continue

                # initialize the LOB snapshot timer
                if last_snapshot_time is None:
                    # process first ticks and check if they're stale ticks;
                    # if so, skip to the next loop.
                    coinbase_order_book.new_tick(tick)

                    last_coinbase_tick_time = coinbase_order_book.last_tick_time
                    if last_coinbase_tick_time is None:
                        continue

                    last_coinbase_tick_time_dt = parse(last_coinbase_tick_time)
                    last_snapshot_time = last_coinbase_tick_time_dt
                    print('{} first tick: {} | Sequence: {}'.format(
                        coinbase_order_book.sym, new_tick_time,
                        coinbase_order_book.sequence))
                    # skip to next loop
                    continue

                # calculate the amount of time between the incoming tick and
                # the tick received before that
                diff = self._get_microsecond_delta(new_tick_time,
                                                   last_snapshot_time)

                # update the LOB, but do not take a LOB snapshot if the tick
                # time is out of sequence. This generally occurs when
                # pre-loading a LOB with stale tick times.
                if diff == -1:
                    coinbase_order_book.new_tick(tick)
                    continue

                # derive the number of LOB snapshot insertions for the
                # data buffer.
                multiple = diff // SNAPSHOT_RATE_IN_MICROSECONDS  # 1000000 is 1 second

                # proceed if we have one or more insertions to make
                if multiple <= 0:
                    coinbase_order_book.new_tick(tick)
                    continue

                # check whether to include Bitfinex data in the features.
                if include_bitfinex:
                    # if bitfinex's LOB is still loading, do NOT export
                    # snapshots of coinbase in the meantime; continue to
                    # the next loop.
                    if bitfinex_order_book.done_warming_up() is False:
                        print("bitfinex_order_book not done warming up: {}"
                              .format(tick))
                        coinbase_order_book.new_tick(tick)
                        # update the LOB snapshot tracker.
                        for _ in range(multiple):
                            last_snapshot_time += timedelta(
                                milliseconds=snapshot_interval_milliseconds)
                        # move to the next loop and see if bitfinex's LOB
                        # is ready then.
                        continue

                    # since both coinbase and bitfinex LOBs are assumed to be
                    # pre-loaded at this point, we can proceed to export
                    # snapshots of the LOB, even if there has been a 'long'
                    # duration between consecutive ticks.
                    coinbase_order_book_snapshot = \
                        coinbase_order_book.render_book()
                    bitfinex_order_book_snapshot = \
                        bitfinex_order_book.render_book()
                    midpoint_delta = coinbase_order_book.midpoint - \
                        bitfinex_order_book.midpoint

                    # update the LOB snapshot time-delta AND add LOB snapshots
                    # to the data buffer.
                    for i in range(multiple):
                        last_snapshot_time += timedelta(
                            milliseconds=snapshot_interval_milliseconds)
                        snapshot_list.append(np.hstack((
                            last_snapshot_time,
                            coinbase_order_book.midpoint,  # midpoint price
                            midpoint_delta,  # price delta between exchanges
                            coinbase_order_book_snapshot,
                            bitfinex_order_book_snapshot)))

                    # update the order book with the most recent tick now, so
                    # the snapshots are up to date for the next iteration of
                    # the loop.
                    coinbase_order_book.new_tick(tick)
                    continue
                else:  # do not include bitfinex
                    coinbase_order_book_snapshot = \
                        coinbase_order_book.render_book()
                    for i in range(multiple):
                        last_snapshot_time += timedelta(
                            milliseconds=snapshot_interval_milliseconds)
                        snapshot_list.append(np.hstack((
                            last_snapshot_time,
                            coinbase_order_book.midpoint,
                            coinbase_order_book_snapshot)))

                    # update the order book with the most recent tick now, so
                    # the snapshots are up to date for the next iteration of
                    # the loop.
                    coinbase_order_book.new_tick(tick)
                    continue

            # incoming tick is from the Bitfinex exchange
            elif include_bitfinex and bitfinex_order_book.done_warming_up():
                bitfinex_order_book.new_tick(tick)
                continue

        elapsed = (dt.now(TIMEZONE) - start_time).seconds
        print('Completed get_orderbook_snapshot_history() with %i ticks in '
              '%i seconds at %i ticks/second'
              % (loop_length, elapsed, loop_length // elapsed))

        orderbook_snapshot_history = pd.DataFrame(
            snapshot_list,
            columns=self.get_feature_labels(
                include_system_time=True,
                include_spread=False,
                include_bitfinex=include_bitfinex,
                include_order_flow=INCLUDE_ORDERFLOW,
                include_imbalances=False,
                include_ema=self.alpha))
        orderbook_snapshot_history = orderbook_snapshot_history.dropna(axis=0)

        return orderbook_snapshot_history

    def extract_features(self, query: dict):
        """
        Create and export limit order book data to csv. This function exports
        multiple days of data and ensures each day starts and ends exactly
        on time.

        :param query: (dict) ccy=sym, daterange=(YYYYMMDD,YYYYMMDD)
        :return: void
        """
        start_time = dt.now(tz=TIMEZONE)

        order_book_data = self.get_orderbook_snapshot_history(query=query)
        if order_book_data is not None:
            dates = order_book_data['system_time'].dt.date.unique()
            print('dates: {}'.format(dates))
            for date in dates[1:]:
                tmp = order_book_data.loc[
                    order_book_data['system_time'].dt.date == date]
                self.export_to_csv(
                    tmp, filename='{}_{}'.format(query['ccy'][0], date),
                    compress=True)

        elapsed = (dt.now(tz=TIMEZONE) - start_time).seconds
        print('***\nSimulator.extract_features() executed in %i seconds\n***'
              % elapsed)
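# A small worked example of the order-imbalance formula used in
# _get_order_imbalance() above (illustrative numbers only):
#     imbalance = (bid_notional - ask_notional) / (bid_notional + ask_notional)
# which is bounded to [-1, 1]: all-bid liquidity -> +1, all-ask -> -1,
# balanced -> 0.
import numpy as np

bid_notional = np.array([300., 150., 100.])
ask_notional = np.array([100., 150., 400.])
imbalance = (bid_notional - ask_notional) / (bid_notional + ask_notional)
print(imbalance)         # [ 0.5  0.  -0.6]
print(imbalance.mean())  # approx -0.0333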