Пример #1
0
 def __init__(self, z_score=True, alpha=None):
     """
     Build a Simulator: choose a scaler, load the EMA helper and create a
     database handle.

     :param z_score: if truthy, a StandardScaler (z-score) is used,
                     otherwise a MinMaxScaler
     :param alpha: passed unchanged to load_ema() and stored on the instance
     """
     # choose the scaler class first, then instantiate it exactly once
     scaler_cls = StandardScaler if z_score else MinMaxScaler
     self._scaler = scaler_cls()

     # directory that contains this module
     module_path = os.path.realpath(__file__)
     self.cwd = os.path.dirname(module_path)

     self.ema = load_ema(alpha=alpha)
     self.alpha = alpha

     # placeholder symbol/exchange, recording switched off
     self.db = Database(sym='None', exchange='None', record_data=False)
Пример #2
0
    def __init__(self, ccy: str, exchange: str) -> None:
        """
        OrderBook constructor.

        Creates the database handle and initialises its connection, builds
        one book object per side, and resets the derived price fields and
        trade trackers.

        :param ccy: currency symbol
        :param exchange: 'coinbase' or 'bitfinex'
        """
        self.sym = ccy
        self.db = Database(sym=ccy, exchange=exchange)
        self.db.init_db_connection()
        # get_orderbook() returns a callable (presumably the exchange-specific
        # book class -- confirm); it is looked up and called once per side
        self.bids = get_orderbook(name=exchange)(sym=ccy, side='bids')
        self.asks = get_orderbook(name=exchange)(sym=ccy, side='asks')
        self.exchange = exchange
        # float() evaluates to 0.0
        self.midpoint = float()
        self.spread = float()
        # separate trackers for buy-side and sell-side trades
        self.buy_tracker = TradeTracker()
        self.sell_tracker = TradeTracker()
        # no tick has been seen yet
        self.last_tick_time = None
Пример #3
0
    def __init__(self, sym: str, exchange: str):
        """
        Set up an order book for one instrument on one exchange.

        :param sym: instrument name
        :param exchange: key into BOOK_BY_EXCHANGE; the documented values
            are 'coinbase', 'bitfinex' and 'bitmex'
        """
        self.sym = sym
        self.exchange = exchange

        self.db = Database(sym=sym, exchange=exchange)
        self.db.init_db_connection()

        # both sides of the book use the same exchange-specific class
        book_cls = BOOK_BY_EXCHANGE[exchange]
        self.bids = book_cls(sym=sym, side='bids')
        self.asks = book_cls(sym=sym, side='asks')

        # derived prices start at zero until the book is rendered
        self.midpoint = 0.0
        self.spread = 0.0

        self.buy_tracker = TradeTracker()
        self.sell_tracker = TradeTracker()
        self.last_tick_time = None
Пример #4
0
 def __init__(self, ccy, exchange):
     """
     Create an order book for one currency on one exchange.

     :param ccy: currency symbol
     :param exchange: 'coinbase' selects CoinbaseBook; any other value
         selects BitfinexBook
     """
     self.sym = ccy
     self.db = Database(ccy, exchange)

     # one book class serves both sides
     if exchange == 'coinbase':
         side_book = CoinbaseBook
     else:
         side_book = BitfinexBook
     self.bids = side_book(ccy, 'bids')
     self.asks = side_book(ccy, 'asks')

     self.midpoint = 0.0
     # running totals for buy-side and sell-side trades
     self.trade_tracker = {'buys': 0.0, 'sells': 0.0}
Пример #5
0
    def __init__(self, ccy, exchange):
        """
        Create an order book for one currency on one exchange.

        The database connection is initialised first, then one book per side
        and one trade tracker per side are created.

        :param ccy: currency symbol
        :param exchange: 'coinbase' selects CoinbaseBook; any other value
            selects BitfinexBook
        """
        self.sym = ccy
        self.db = Database(sym=ccy, exchange=exchange)
        self.db.init_db_connection()

        # one book class serves both sides
        if exchange == 'coinbase':
            side_book = CoinbaseBook
        else:
            side_book = BitfinexBook
        self.bids = side_book(ccy, 'bids')
        self.asks = side_book(ccy, 'asks')

        self.midpoint = 0.0
        self.buy_tracker = TradeTracker()
        self.sell_tracker = TradeTracker()
        self.last_tick_time = None
Пример #6
0
 def __init__(self):
     """
     Simulator constructor: remember this module's directory and create a
     database handle with recording switched off.
     """
     module_path = os.path.realpath(__file__)
     self.cwd = os.path.dirname(module_path)
     # placeholder symbol/exchange
     self.db = Database(sym='None', exchange='None', record_data=False)
Пример #7
0
class Simulator(object):
    """
    Replay stored tick data through a limit order book (LOB) and export
    fixed-interval snapshots of the book to csv files.
    """

    def __init__(self):
        """
        Simulator constructor.

        Records the directory of this module and creates a `Database` handle
        with placeholder symbol/exchange and `record_data=False`.
        """
        self.cwd = os.path.dirname(os.path.realpath(__file__))
        self.db = Database(sym='None', exchange='None', record_data=False)

    def __str__(self):
        return 'Simulator: [ db={} ]'.format(self.db)

    @staticmethod
    def export_to_csv(data: pd.DataFrame,
                      filename: str = 'BTC-USD_2019-01-01',
                      compress: bool = True) -> None:
        """
        Export data within a Panda DataFrame to a csv.

        The file is written under DATA_PATH as `<filename>.csv`, or
        `<filename>.csv.xz` when compression is enabled. The index is not
        written.

        :param data: (panda.DataFrame) historical tick data
        :param filename: CCY_YYYY-MM-DD
        :param compress: Default True. If True, compress with xz
        """
        start_time = dt.now(tz=TIMEZONE)

        sub_folder = os.path.join(DATA_PATH, filename) + '.csv'

        if compress:
            sub_folder += '.xz'
            data.to_csv(path_or_buf=sub_folder, index=False, compression='xz')
        else:
            data.to_csv(path_or_buf=sub_folder, index=False)

        elapsed = (dt.now(tz=TIMEZONE) - start_time).seconds
        LOGGER.info('Exported %s with %i rows in %i seconds' %
                    (sub_folder, data.shape[0], elapsed))

    @staticmethod
    def get_ema_labels(features_list: list, ema_list: list,
                       include_system_time: bool):
        """
        Get a list of column labels for EMA values in a list.

        Every feature except 'system_time' is labelled `<feature>_<ema>`;
        labels are grouped by EMA value, in the order the values are given.

        :param features_list: feature column names
        :param ema_list: EMA values used as label suffixes
        :param include_system_time: if TRUE, 'system_time' is the first label
        :return: (list) EMA column labels
        """
        assert isinstance(ema_list, list) is True, \
            "Error: EMA_LIST must be a list data type, not {}".format(type(ema_list))

        ema_labels = ['{}_{}'.format(col, ema)
                      for ema in ema_list
                      for col in features_list
                      if col != 'system_time']

        if include_system_time:
            ema_labels.insert(0, 'system_time')

        return ema_labels

    @staticmethod
    def _get_microsecond_delta(new_tick_time: dt,
                               last_snapshot_time: dt) -> int:
        """
        Calculate difference between two consecutive ticks.

        The full difference is returned, including whole days.

        :param new_tick_time: datetime of incoming tick
        :param last_snapshot_time: datetime of last LOB snapshot
        :return: (int) microseconds between the two times, or -1 if the
            incoming tick is older than the last snapshot
        """
        if last_snapshot_time > new_tick_time:
            return -1

        delta = new_tick_time - last_snapshot_time
        # timedelta.seconds alone wraps around every 24 hours, so the whole
        # days have to be added explicitly
        whole_seconds = delta.days * 86400 + delta.seconds

        return whole_seconds * 1000000 + delta.microseconds

    def get_orderbook_snapshot_history(self,
                                       query: dict) -> pd.DataFrame or None:
        """
        Function to replay historical market data and generate the features used for
        reinforcement learning & training.

        NOTE:
            The query can either be a single Coinbase CCY, or both Coinbase and Bitfinex,
            but it cannot be only a Bitfinex CCY. Later releases of this repo will
            support Bitfinex only order book reconstruction.

        :param query: (dict) query for finding tick history in Arctic TickStore;
            `query['ccy'][0]` is used as the instrument name
        :return: (pd.DataFrame) snapshots of limit order books using a
                stationary feature set, or None if the query returned no data
        """
        self.db.init_db_connection()

        tick_history = self.db.get_tick_history(query=query)
        if tick_history is None:
            # Logger.warn is a deprecated alias of Logger.warning
            LOGGER.warning("Query returned no data: {}".format(query))
            return None

        loop_length = tick_history.shape[0]

        # time between LOB snapshots, kept at full microsecond resolution so
        # snapshot timestamps cannot drift from the configured rate
        snapshot_interval = timedelta(microseconds=SNAPSHOT_RATE_IN_MICROSECONDS)

        snapshot_list = list()
        last_snapshot_time = None
        tick_types_for_warm_up = {'load_book', 'book_loaded', 'preload'}

        instrument_name = query['ccy'][0]
        assert isinstance(instrument_name, str), \
            "Error: instrument_name must be a string, not -> {}".format(
                type(instrument_name))

        LOGGER.info('querying {}'.format(instrument_name))

        order_book = get_orderbook_from_symbol(symbol=instrument_name)(
            sym=instrument_name)

        start_time = dt.now(TIMEZONE)
        LOGGER.info(
            'Starting get_orderbook_snapshot_history() loop with %i ticks for %s'
            % (loop_length, query['ccy']))

        # loop through all ticks returned from the Arctic Tick Store query.
        for count, tx in enumerate(tick_history.itertuples()):

            # periodically print number of steps completed
            if count % 250000 == 0:
                elapsed = (dt.now(TIMEZONE) - start_time).seconds
                LOGGER.info('...completed %i loops in %i seconds' %
                            (count, elapsed))

            # convert to dictionary for processing
            tick = tx._asdict()

            # filter out bad ticks
            if 'type' not in tick:
                continue

            # flags for a order book reset
            if tick['type'] in tick_types_for_warm_up:
                order_book.new_tick(msg=tick)
                continue

            # check if the LOB is pre-loaded, if not skip message and do NOT process.
            if order_book.done_warming_up is False:
                LOGGER.info("{} order book is not done warming up: {}".format(
                    instrument_name, tick))
                continue

            # timestamp for incoming tick; only hand the value to the parser
            # when one is present, so a missing timestamp is skipped below
            # instead of failing inside parse()
            raw_tick_time = tick.get('system_time')
            new_tick_time = parse(raw_tick_time) \
                if raw_tick_time is not None else None

            # remove ticks without timestamps (should not exist/happen)
            if new_tick_time is None:
                LOGGER.info('No tick time: {}'.format(tick))
                continue

            # initialize the LOB snapshot timer
            if last_snapshot_time is None:
                # process first ticks and check if they're stale ticks; if so,
                # skip to the next loop.
                order_book.new_tick(tick)

                last_tick_time = order_book.last_tick_time
                if last_tick_time is None:
                    continue

                last_snapshot_time = parse(last_tick_time)
                LOGGER.info('{} first tick: {} '.format(
                    order_book.sym, new_tick_time))
                # skip to next loop
                continue

            # calculate the amount of time between the incoming
            #   tick and the last snapshot
            diff = self._get_microsecond_delta(new_tick_time,
                                               last_snapshot_time)

            # update the LOB, but do not take a LOB snapshot if the tick time is
            # out of sequence. This occurs when pre-loading a LOB with stale tick
            # times in general.
            if diff == -1:
                order_book.new_tick(msg=tick)
                continue

            # derive the number of LOB snapshot insertions for the data buffer.
            multiple = diff // SNAPSHOT_RATE_IN_MICROSECONDS

            # proceed if we have one or more insertions to make
            if multiple <= 0:
                order_book.new_tick(msg=tick)
                continue

            # the book is rendered once (before applying the new tick) and
            # repeated for every snapshot slot that elapsed since the last one
            order_book_snapshot = order_book.render_book()
            for _ in range(multiple):
                last_snapshot_time += snapshot_interval
                snapshot_list.append(
                    np.hstack((last_snapshot_time, order_book_snapshot)))

            # update order book with most recent tick now, so the snapshots
            # are up to date for the next iteration of the loop.
            order_book.new_tick(msg=tick)

        # clamp to 1 second to avoid dividing by zero in the rate below
        elapsed = max((dt.now(TIMEZONE) - start_time).seconds, 1)
        LOGGER.info('Completed get_orderbook_snapshot_history() with %i ticks '
                    'in %i seconds at %i ticks/second' %
                    (loop_length, elapsed, loop_length // elapsed))

        orderbook_snapshot_history = pd.DataFrame(
            data=snapshot_list,
            columns=['system_time'] + order_book.render_lob_feature_names())

        # remove NAs from data set (and print the amount)
        before_shape = orderbook_snapshot_history.shape[0]
        orderbook_snapshot_history = orderbook_snapshot_history.dropna(axis=0)
        difference_in_records = orderbook_snapshot_history.shape[
            0] - before_shape
        LOGGER.info("{} {} rows due to NA values".format(
            'Dropping' if difference_in_records <= 0 else 'Adding',
            abs(difference_in_records)))

        return orderbook_snapshot_history

    def extract_features(self, query: dict) -> None:
        """
        Create and export limit order book data to csv. One file is written
        per calendar date found in the snapshots' 'system_time' column.

        Nothing is exported when the query returns no data or produces no
        snapshots.

        :param query: (dict) ccy=sym, daterange=(YYYYMMDD,YYYYMMDD)
        :return: void
        """
        start_time = dt.now(tz=TIMEZONE)

        order_book_data = self.get_orderbook_snapshot_history(query=query)
        # an empty frame has no datetime-typed 'system_time' column, so the
        # `.dt` accessor below would fail on it
        if order_book_data is not None and not order_book_data.empty:
            snapshot_dates = order_book_data['system_time'].dt.date
            dates = snapshot_dates.unique()
            LOGGER.info('dates: {}'.format(dates))
            for day in dates:
                tmp = order_book_data.loc[snapshot_dates == day]
                self.export_to_csv(tmp,
                                   filename='{}_{}'.format(
                                       query['ccy'][0], day),
                                   compress=True)

        elapsed = (dt.now(tz=TIMEZONE) - start_time).seconds
        LOGGER.info(
            '***\nSimulator.extract_features() executed in %i seconds\n***' %
            elapsed)
Пример #8
0
class OrderBook(ABC):
    """
    Abstract limit order book (LOB): a bid book, an ask book and one trade
    tracker per side. Subclasses implement `new_tick` to apply incoming
    messages.
    """

    def __init__(self, ccy, exchange):
        """
        OrderBook constructor

        :param ccy: currency symbol
        :param exchange: 'coinbase' selects CoinbaseBook; any other value
            selects BitfinexBook
        """
        self.sym = ccy
        self.db = Database(sym=ccy, exchange=exchange)
        self.db.init_db_connection()
        book_cls = CoinbaseBook if exchange == 'coinbase' else BitfinexBook
        self.bids = book_cls(ccy, 'bids')
        self.asks = book_cls(ccy, 'asks')
        self.midpoint = float()
        self.buy_tracker = TradeTracker()
        self.sell_tracker = TradeTracker()
        self.last_tick_time = None

    def __str__(self):
        return '%s  ||  %s' % (self.bids, self.asks)

    @abstractmethod
    def new_tick(self, msg):
        """
        Event handler for incoming tick messages

        :param msg: incoming order or trade message
        :return:
        """
        pass

    def clear_trade_trackers(self):
        """
        Reset buy and sell trade trackers; used between LOB snapshots

        :return: (void)
        """
        self.buy_tracker.clear()
        self.sell_tracker.clear()

    def clear_book(self):
        """
        Method to reset the limit order book

        Clears both sides and forgets the last tick time.

        :return: (void)
        """
        self.bids.clear()
        self.asks.clear()
        self.last_tick_time = None
        print('--Cleared %s order book--' % self.sym)

    def render_book(self):
        """
        Create stationary feature set for limit order book

        Inspired by: https://arxiv.org/abs/1810.09965v1

        Side effects: updates `self.midpoint` and clears both trade trackers.

        Layout of the returned vector: bid notionals, ask notionals, bid
        distances, ask distances, buy trades, sell trades and, when
        INCLUDE_ORDERFLOW is set, the cancel / limit / market notionals
        (bid before ask for each).

        :return: numpy array
        """
        bid_price, bid_level = self.bids.get_bid()
        ask_price, ask_level = self.asks.get_ask()

        self.midpoint = (bid_price + ask_price) / 2.0

        bid_data = self.bids.get_bids_to_list(midpoint=self.midpoint)
        ask_data = self.asks.get_asks_to_list(midpoint=self.midpoint)

        # read the trade notionals before the trackers are reset
        buy_trades = np.array(self.buy_tracker.notional)
        sell_trades = np.array(self.sell_tracker.notional)
        self.clear_trade_trackers()

        if INCLUDE_ORDERFLOW:
            bid_distances, bid_notionals, bid_cancel_notionals, bid_limit_notionals, \
                bid_market_notionals = bid_data

            ask_distances, ask_notionals, ask_cancel_notionals, ask_limit_notionals, \
                ask_market_notionals = ask_data

            return np.hstack((bid_notionals, ask_notionals, bid_distances,
                              ask_distances, buy_trades, sell_trades,
                              bid_cancel_notionals, ask_cancel_notionals,
                              bid_limit_notionals, ask_limit_notionals,
                              bid_market_notionals, ask_market_notionals))

        bid_distances, bid_notionals = bid_data
        ask_distances, ask_notionals = ask_data

        return np.hstack((bid_notionals, ask_notionals, bid_distances,
                          ask_distances, buy_trades, sell_trades))

    @property
    def best_bid(self):
        """
        Get the best bid

        :return: whatever `self.bids.get_bid()` returns; render_book()
            unpacks that value into a (price, level) pair
        """
        return self.bids.get_bid()

    @property
    def best_ask(self):
        """
        Get the best ask

        :return: whatever `self.asks.get_ask()` returns; render_book()
            unpacks that value into a (price, level) pair
        """
        return self.asks.get_ask()

    def done_warming_up(self):
        """
        Flag to indicate if the entire Limit Order Book has been loaded

        :return: True if loaded / False if still waiting to download
        """
        # Logical operators are required here: on Python bools `~` is the
        # integer bitwise inverse (~True == -2), which is always truthy.
        return not self.bids.warming_up and not self.asks.warming_up
Пример #9
0
class OrderBook(ABC):
    """
    Abstract limit order book (LOB): a bid book, an ask book and one trade
    tracker per side. Subclasses implement `new_tick` to apply incoming
    messages.
    """

    def __init__(self, ccy: str, exchange: str):
        """
        OrderBook constructor.

        :param ccy: currency symbol
        :param exchange: 'coinbase' or 'bitfinex'
        """
        self.sym = ccy
        self.db = Database(sym=ccy, exchange=exchange)
        self.db.init_db_connection()
        self.bids = get_orderbook(name=exchange)(sym=ccy, side='bids')
        self.asks = get_orderbook(name=exchange)(sym=ccy, side='asks')
        self.exchange = exchange
        self.midpoint = float()
        self.spread = float()
        self.buy_tracker = TradeTracker()
        self.sell_tracker = TradeTracker()
        self.last_tick_time = None

    def __str__(self):
        return '%s  ||  %s' % (self.bids, self.asks)

    @abstractmethod
    def new_tick(self, msg: dict) -> bool:
        """
        Event handler for incoming tick messages.

        :param msg: incoming order or trade message
        :return: FALSE if reconnection to WebSocket is needed, else TRUE if good
        """
        return True

    def clear_trade_trackers(self) -> None:
        """
        Reset buy and sell trade trackers; used between LOB snapshots.

        :return: (void)
        """
        self.buy_tracker.clear()
        self.sell_tracker.clear()

    def clear_book(self) -> None:
        """
        Method to reset the limit order book.

        :return: (void)
        """
        self.bids.clear()  # warming_up flag reset in `Position` class
        self.asks.clear()  # warming_up flag reset in `Position` class
        self.last_tick_time = None
        LOGGER.info("{}'s order book cleared.".format(self.sym))

    def render_book(self) -> np.ndarray:
        """
        Create stationary feature set for limit order book.

        Side effects: updates `self.midpoint` and `self.spread`, and clears
        both trade trackers.

        Layout of the returned vector: midpoint, spread, buy trades, sell
        trades, then every array in the bid data followed by every array in
        the ask data.

        :return: LOB feature set
        """
        # get price levels of LOB
        bid_price, bid_level = self.bids.get_bid()
        ask_price, ask_level = self.asks.get_ask()

        # derive midpoint price and spread from bid and ask data
        self.midpoint = (ask_price + bid_price) / 2.0
        self.spread = round(ask_price - bid_price, 4)  # round to clean float rounding

        # transform raw LOB data into stationary feature set
        bid_data = self.bids.get_bids_to_list(midpoint=self.midpoint)
        ask_data = self.asks.get_asks_to_list(midpoint=self.midpoint)

        # convert buy and sell trade notional values to an array
        buy_trades = np.array(self.buy_tracker.notional)
        sell_trades = np.array(self.sell_tracker.notional)

        # reset trackers after each LOB render
        self.clear_trade_trackers()

        return np.hstack((self.midpoint, self.spread,
                          buy_trades, sell_trades,
                          *bid_data, *ask_data))

    @staticmethod
    def render_lob_feature_names(include_orderflow: bool = INCLUDE_ORDERFLOW) -> list:
        """
        Get the column names for the LOB render features.

        Names are generated as `<side>_<feature>_<row>` for MAX_BOOK_ROWS
        rows, bid side first, after the four leading scalar features.

        :param include_orderflow: if TRUE, order flow imbalance stats are included in set
        :return: list containing features names
        """
        feature_names = ['midpoint', 'spread', 'buys', 'sells']

        feature_types = ['distance', 'notional']
        if include_orderflow:
            feature_types += ['cancel_notional', 'limit_notional', 'market_notional']

        for side in ['bid', 'ask']:
            for feature in feature_types:
                for row in range(MAX_BOOK_ROWS):
                    feature_names.append("{}_{}_{}".format(side, feature, row))

        LOGGER.info("render_feature_names() has {} features".format(len(feature_names)))

        return feature_names

    @property
    def best_bid(self) -> tuple:
        """
        Get the best bid.

        :return: whatever `self.bids.get_bid()` returns; render_book()
            unpacks that value into a (price, level) pair
        """
        return self.bids.get_bid()

    @property
    def best_ask(self) -> tuple:
        """
        Get the best ask.

        :return: whatever `self.asks.get_ask()` returns; render_book()
            unpacks that value into a (price, level) pair
        """
        return self.asks.get_ask()

    @property
    def done_warming_up(self) -> bool:
        """
        Flag to indicate if the entire Limit Order Book has been loaded.

        :return: True if loaded / False if still waiting to download
        """
        # Both sides must be checked with logical operators. `&` binds
        # tighter than `is`, so `a is False & b is False` is parsed as the
        # chained comparison `a is (False & b) is False` and never looks at
        # the second side.
        return not self.bids.warming_up and not self.asks.warming_up
Пример #10
0
class Simulator(object):
    def __init__(self, z_score=True, alpha=None):
        """
        Build a Simulator: choose a scaler, load the EMA helper and create a
        database handle.

        :param z_score: if truthy, a StandardScaler (z-score) is used,
                        otherwise a MinMaxScaler
        :param alpha: passed unchanged to load_ema() and stored on the instance
        """
        # choose the scaler class first, then instantiate it exactly once
        scaler_cls = StandardScaler if z_score else MinMaxScaler
        self._scaler = scaler_cls()

        # directory that contains this module
        module_path = os.path.realpath(__file__)
        self.cwd = os.path.dirname(module_path)

        self.ema = load_ema(alpha=alpha)
        self.alpha = alpha

        # placeholder symbol/exchange, recording switched off
        self.db = Database(sym='None', exchange='None', record_data=False)

    def __str__(self) -> str:
        """
        Describe the simulator by the class of its scaler and its `ema`
        attribute.
        """
        return 'Simulator: [ scaler={} | ema={} ]'.format(
            self._scaler.__class__, self.ema)

    @staticmethod
    def get_feature_labels(include_system_time: bool = True,
                           include_bitfinex: bool = True,
                           include_order_flow: bool = INCLUDE_ORDERFLOW,
                           include_imbalances: bool = True,
                           include_spread: bool = False,
                           include_ema=None):
        """
        Build the ordered list of feature column labels.

        Per exchange the order is: notional and distance levels, trade
        sides, optional order-flow levels, optional spread, optional
        imbalance columns. Within each level group the bid labels run from
        the highest level index down to 0 and the ask labels from 0 upwards.

        :param include_system_time: if TRUE, 'system_time' is the first label
        :param include_bitfinex: if TRUE, 'midpoint_delta' and the Bitfinex
                labels are added after the Coinbase ones
        :param include_order_flow: if TRUE, cancel/limit/market notional
                labels are included
        :param include_imbalances: if TRUE, notional imbalance labels
                (per level, mean and std) are included
        :param include_spread: if TRUE, a '<exchange>_spread' label is included
        :param include_ema: None, float, or list
                if list, every label except 'system_time' is repeated once
                per list entry as '<label>_<entry>'
        :return: (list) column labels
        """

        def level_labels(venue, feature):
            # bids descend towards level 0, asks ascend from level 0
            bid_part = ['%s_%s_%s_%i' % (venue, 'bid', feature, lvl)
                        for lvl in reversed(range(MAX_BOOK_ROWS))]
            ask_part = ['%s_%s_%s_%i' % (venue, 'ask', feature, lvl)
                        for lvl in range(MAX_BOOK_ROWS)]
            return bid_part + ask_part

        labels = ['system_time'] if include_system_time else []
        labels.append('coinbase_midpoint')

        venues = ['coinbase']
        if include_bitfinex:
            labels.append('midpoint_delta')
            venues.append('bitfinex')

        for venue in venues:
            for feature in ('notional', 'distance'):
                labels.extend(level_labels(venue, feature))

            labels.extend('%s_%s' % (venue, trade_side)
                          for trade_side in ('buys', 'sells'))

            if include_order_flow:
                for feature in ('cancel_notional', 'limit_notional',
                                'market_notional'):
                    labels.extend(level_labels(venue, feature))

            if include_spread:
                labels.append('{}_spread'.format(venue))

            if include_imbalances:
                # NOTE(review): these labels carry no exchange prefix, so they
                # are emitted once per exchange and repeat when Bitfinex is
                # included -- confirm this is intended.
                labels.extend('notional_imbalance_{}'.format(lvl)
                              for lvl in range(MAX_BOOK_ROWS))
                labels.append('notional_imbalance_mean')
                labels.append('notional_imbalance_std')

        if not isinstance(include_ema, list):
            return labels

        # one copy of every non-time label per EMA entry
        ema_labels = ['{}_{}'.format(label, ema)
                      for ema in include_ema
                      for label in labels
                      if label != 'system_time']
        if include_system_time:
            ema_labels.insert(0, 'system_time')
        return ema_labels

    def export_to_csv(self,
                      data: pd.DataFrame,
                      filename='BTC-USD_2019-01-01',
                      compress=True):
        """
        Write a DataFrame to `<cwd>/data_exports/<filename>.csv`, or to
        `<filename>.csv.xz` when compression is requested. The index is not
        written, and a one-line summary is printed afterwards.

        :param data: (panda.DataFrame) historical tick data
        :param filename: CCY_YYYY-MM-DD
        :param compress: Default True. If True, compress with xz
        :return: void
        """
        started_at = dt.now(tz=TIMEZONE)

        target_path = os.path.join(self.cwd, 'data_exports', filename) + '.csv'

        # the compression argument is only passed when compressing
        writer_options = {'index': False}
        if compress:
            target_path += '.xz'
            writer_options['compression'] = 'xz'
        data.to_csv(path_or_buf=target_path, **writer_options)

        elapsed = (dt.now(tz=TIMEZONE) - started_at).seconds
        print('Exported %s with %i rows in %i seconds' %
              (target_path, data.shape[0], elapsed))

    @staticmethod
    def import_csv(filename: str) -> pd.DataFrame:
        """
        Import an historical tick file created from the
        export_to_csv() function

        The format is chosen from the file extension (case-insensitive):
        '.xz' is read as an xz-compressed csv, '.csv' as a plain csv. The
        first column is used as the index.

        :param filename: Full file path including filename
        :return: (panda.DataFrame) historical limit order book data, or None
                if the extension is neither '.csv' nor '.xz'
        """
        start_time = dt.now(tz=TIMEZONE)

        # match on the extension only: a substring test would misclassify
        # any path that merely contains 'xz' or 'csv' somewhere
        lowered = filename.lower()
        if lowered.endswith('.xz'):
            compression = 'xz'
        elif lowered.endswith('.csv'):
            compression = None
        else:
            print('Error: file must be a csv or xz')
            return None

        data = pd.read_csv(filepath_or_buffer=filename,
                           index_col=0,
                           compression=compression,
                           engine='c')

        elapsed = (dt.now(tz=TIMEZONE) - start_time).seconds
        print('Imported %s from a csv in %i seconds' %
              (filename[-21:], elapsed))
        return data

    def fit_scaler(self, orderbook_snapshot_history: pd.DataFrame) -> None:
        """
        Fit the instance's scaler on limit order book data.

        This only calls `fit` on the scaler; nothing is transformed or
        returned here.

        :param orderbook_snapshot_history: Limit order book data
                from the previous day
        :return: (void)
        """
        self._scaler.fit(orderbook_snapshot_history)

    def scale_data(self, data: pd.DataFrame):
        """
        Normalize data with the instance's scaler.

        :param data: (pd.DataFrame) all data in environment
        :return: the result of the scaler's `transform` call
                (presumably an np.array for the scikit-learn scalers -- confirm)
        """
        return self._scaler.transform(data)

    @staticmethod
    def _midpoint_diff(data: pd.DataFrame):
        """
        Take log difference of midpoint prices
                log(price t) - log(price t-1)
        :param data: (pd.DataFrame) raw data from LOB snapshots
        :return: (pd.DataFrame) with midpoint prices normalized
        """
        data['coinbase_midpoint'] = np.log(data['coinbase_midpoint'].values)
        data['coinbase_midpoint'] = (
            data['coinbase_midpoint'] -
            data['coinbase_midpoint'].shift(1)).fillna(method='bfill')
        return data

    @staticmethod
    def _spread_calc(data: pd.DataFrame) -> pd.DataFrame:
        """
        Derive the spread and normalize it by a multiple of the market order fee.
        :param data: (pd.DataFrame) data set containing a bid and ask
        :return: data with spread added as the last column
        """
        # calculate the spread in real terms ('+' because bid_distances are all negative)
        data['coinbase_spread'] = data['coinbase_ask_distance_0'].values + \
                                  data['coinbase_bid_distance_0'].values
        return data

    @staticmethod
    def _get_order_imbalance(data: pd.DataFrame):
        """
        Calculate order imbalances per price level, their mean & standard deviation.

        Order Imbalances are calculated by:
            = (bid_quantity - ask_quantity) / (bid_quantity + ask_quantity)

        ...thus scale from [-1, 1].

        Levels where the calculation yields NaN (for example when both
        notionals are zero) are reported as 0. The returned frame has a
        fresh default index, not the index of `data`.

        :param data: raw/unnormalized LOB snapshot data
        :return: (pd.DataFrame) order imbalances at N-levels, the mean & std imbalance
        """
        # create the column names for making a data frame (also used for debugging)
        levels = range(MAX_BOOK_ROWS)
        bid_notional_columns = ['coinbase_bid_notional_{}'.format(i) for i in levels]
        ask_notional_columns = ['coinbase_ask_notional_{}'.format(i) for i in levels]
        imbalance_columns = ['notional_imbalance_{}'.format(i) for i in levels]

        # Columns are selected by name in the same level order for both
        # sides, so level i of the bids already lines up with level i of the
        # asks. No reversal is applied: slicing the 2-D value array with
        # [::-1] would flip the rows (snapshots in time), not the levels.
        bid_notional = data[bid_notional_columns].values
        ask_notional = data[ask_notional_columns].values

        # calculate the order imbalance
        imbalances = (bid_notional - ask_notional) / (bid_notional +
                                                      ask_notional)
        imbalances = pd.DataFrame(imbalances,
                                  columns=imbalance_columns).fillna(0.)

        # add meta data to features (mean and std)
        imbalances['notional_imbalance_mean'] = imbalances[
            imbalance_columns].mean(axis=1)
        imbalances['notional_imbalance_std'] = imbalances[
            imbalance_columns].std(axis=1)
        return imbalances

    def load_environment_data(self,
                              fitting_file: str,
                              testing_file: str,
                              include_imbalances: bool = True,
                              as_pandas: bool = False):
        """
        Import and scale environment data set with prior day's data.

        The scaler is fitted on `fitting_file` and then applied to
        `testing_file`. Both files are read from `<self.cwd>/data_exports`.
        Scaled values are clipped to the range [-10, 10].

        Midpoint gets log-normalized:
            log(price t) - log(price t-1)

        :param fitting_file: prior trading day
        :param testing_file: current trading day
        :param include_imbalances: if TRUE, include LOB imbalances
        :param as_pandas: if TRUE, return data as DataFrame, otherwise np.array
        :return: (tuple) midpoint prices, raw testing data, and the scaled
            environment data; pandas objects if `as_pandas`, else np.arrays
        """
        data_directory = os.path.join(self.cwd, 'data_exports')

        # import data used to fit scaler
        fitting_data = self.import_csv(
            filename=os.path.join(data_directory, fitting_file))
        # check if bitfinex data is in the data set. Column names carry the
        # exchange as part of the name (e.g., 'coinbase_midpoint'), so look
        # for the substring rather than for a column called exactly 'bitfinex'.
        include_bitfinex = any(
            'bitfinex' in str(column) for column in fitting_data.columns)
        # carry on with data import process
        fitting_data = self._midpoint_diff(
            data=fitting_data)  # normalize midpoint
        fitting_data = self._spread_calc(data=fitting_data)  # add the spread
        fitting_data = apply_ema_all_data(ema=self.ema, data=fitting_data)
        self.fit_scaler(fitting_data)
        del fitting_data  # free the memory before loading the next file

        # import data to normalize and use in environment
        data = self.import_csv(
            filename=os.path.join(data_directory, testing_file))
        midpoint_prices = data['coinbase_midpoint']

        # work on a deep copy so that the raw `data` returned to the caller
        # is left as imported
        normalized_data = self._midpoint_diff(data.copy(deep=True))
        normalized_data = self._spread_calc(
            data=normalized_data)  # add the spread
        normalized_data = apply_ema_all_data(ema=self.ema,
                                             data=normalized_data)

        normalized_data = self.scale_data(normalized_data)
        normalized_data = np.clip(normalized_data, -10, 10)
        normalized_data = pd.DataFrame(normalized_data,
                                       columns=self.get_feature_labels(
                                           include_system_time=False,
                                           include_bitfinex=include_bitfinex,
                                           include_spread=True,
                                           include_imbalances=False,
                                           include_ema=self.alpha))

        if include_imbalances:
            print('Adding order imbalances...')
            # Note: since order imbalance data is scaled [-1, 1], we do not apply
            # z-score to the imbalance data
            imbalance_data = self._get_order_imbalance(data=data)
            imbalance_data = apply_ema_all_data(ema=reset_ema(self.ema),
                                                data=imbalance_data)
            normalized_data = pd.concat((normalized_data, imbalance_data),
                                        axis=1)

        if as_pandas is False:
            midpoint_prices = midpoint_prices.values
            data = data.values
            normalized_data = normalized_data.values

        return midpoint_prices, data, normalized_data

    @staticmethod
    def _get_microsecond_delta(new_tick_time: dt, last_snapshot_time: dt):
        """
        Calculate difference between two consecutive ticks.
        Note: only tracks timedelta for up to a minute.
        :param new_tick_time: datetime of incoming tick
        :param last_snapshot_time: datetime of last LOB snapshot
        :return: (int) delta between ticks
        """
        if last_snapshot_time > new_tick_time:
            return -1
        snapshot_tick_time_delta = new_tick_time - last_snapshot_time
        seconds = snapshot_tick_time_delta.seconds * 1000000
        microseconds = snapshot_tick_time_delta.microseconds
        # print("seconds={} | microseconds={}".format(seconds, microseconds))
        return seconds + microseconds

    def get_orderbook_snapshot_history(self, query: dict):
        """
        Function to replay historical market data and generate
        the features used for reinforcement learning & training.

        Ticks are applied to the order book(s) one by one. Whenever one or
        more snapshot intervals have elapsed since the last snapshot, the
        state of the book BEFORE the incoming tick is exported once per
        elapsed interval, and only then is the tick applied.

        NOTE:
        The query can either be a single Coinbase CCY, or both Coinbase and Bitfinex,
        but it cannot be only a Bitfinex CCY. Later releases of this repo will
        support Bitfinex only orderbook reconstruction.

        :param query: (dict) query for finding tick history in Arctic TickStore;
            `query['ccy']` holds the symbols, Coinbase first and (optionally)
            Bitfinex second
        :return: (pd.DataFrame) snapshots of limit order books using a
                stationary feature set, or None if the query returned no data
        """
        self.db.init_db_connection()
        tick_history = self.db.get_tick_history(query=query)
        if tick_history is None:
            print("Query returned no data: {}".format(query))
            return None

        loop_length = tick_history.shape[0]

        # time between LOB snapshots (truncated to whole milliseconds)
        snapshot_interval = timedelta(
            milliseconds=SNAPSHOT_RATE_IN_MICROSECONDS // 1000)

        snapshot_list = list()
        last_snapshot_time = None

        symbols = query['ccy']
        print('querying {}'.format(symbols))

        include_bitfinex = len(symbols) > 1
        if include_bitfinex:
            print('\n\nIncluding Bitfinex data in feature set.\n\n')

        coinbase_order_book = CoinbaseOrderBook(symbols[0])
        bitfinex_order_book = BitfinexOrderBook(symbols[1]) if include_bitfinex \
            else None

        start_time = dt.now(TIMEZONE)
        print(
            'Starting get_orderbook_snapshot_history() loop with %i ticks for %s'
            % (loop_length, query['ccy']))

        # loop through all ticks returned from the Arctic Tick Store query.
        for count, tx in enumerate(tick_history.itertuples()):

            # periodically print number of steps completed
            if count % 250000 == 0:
                # total_seconds() also counts whole days, unlike `.seconds`
                elapsed = int((dt.now(TIMEZONE) - start_time).total_seconds())
                print('...completed %i loops in %i seconds' % (count, elapsed))

            # convert to dictionary for processing
            tick = tx._asdict()

            # determine if incoming tick is from coinbase or bitfinex
            coinbase = tick['product_id'] == coinbase_order_book.sym

            # filter out bad ticks
            if 'type' not in tick:
                continue

            # flags for a order book reset
            if tick['type'] in ('load_book', 'book_loaded', 'preload'):
                if coinbase:
                    coinbase_order_book.new_tick(tick)
                elif bitfinex_order_book is not None:
                    # there is no bitfinex book when only coinbase was queried;
                    # foreign ticks are ignored in that case.
                    bitfinex_order_book.new_tick(tick)
                # skip to next loop
                continue

            # incoming tick is for coinbase LOB
            if coinbase:
                # check if the LOB is pre-loaded, if not skip message and do NOT process.
                if coinbase_order_book.done_warming_up() is False:
                    print("coinbase_order_book not done warming up: {}".format(
                        tick))
                    continue

                # remove ticks without timestamps (should not exist/happen).
                # This has to be checked BEFORE parsing, since the parser
                # raises on a missing value instead of returning None.
                raw_tick_time = tick.get('time')
                if pd.isnull(raw_tick_time):
                    print('No tick time: {}'.format(tick))
                    continue

                # timestamp for incoming tick
                new_tick_time = parse(raw_tick_time)

                # initialize the LOB snapshot timer
                if last_snapshot_time is None:
                    # process first ticks and check if they're stale ticks; if so,
                    # skip to the next loop.
                    coinbase_order_book.new_tick(tick)
                    last_coinbase_tick_time = coinbase_order_book.last_tick_time
                    if last_coinbase_tick_time is None:
                        continue
                    last_snapshot_time = parse(last_coinbase_tick_time)
                    print('{} first tick: {} | Sequence: {}'.format(
                        coinbase_order_book.sym, new_tick_time,
                        coinbase_order_book.sequence))
                    # skip to next loop
                    continue

                # calculate the amount of time between the incoming
                #   tick and the last LOB snapshot
                diff = self._get_microsecond_delta(new_tick_time,
                                                   last_snapshot_time)

                # update the LOB, but do not take a LOB snapshot if the tick time is
                # out of sequence. This occurs when pre-loading a LOB with stale tick
                # times in general.
                if diff == -1:
                    coinbase_order_book.new_tick(tick)
                    continue

                # derive the number of LOB snapshot insertions for the data buffer.
                multiple = diff // SNAPSHOT_RATE_IN_MICROSECONDS

                # proceed if we have one or more insertions to make
                if multiple <= 0:
                    coinbase_order_book.new_tick(tick)
                    continue

                # check to include Bitfinex data in features.
                if include_bitfinex:
                    # if bitfinex's LOB is still loading, do NOT export snapshots
                    # of coinbase in the meantime and continue to next loop.
                    if bitfinex_order_book.done_warming_up() is False:
                        print("bitfinex_order_book not done warming up: {}".
                              format(tick))
                        coinbase_order_book.new_tick(tick)
                        # update the LOB snapshot tracker.
                        last_snapshot_time += multiple * snapshot_interval
                        # move to next loop and see if bitfinex's LOB is ready then.
                        continue

                    # since both coinbase and bitfinex LOBs are assumed to be
                    # pre-loaded at this point, we can proceed to export snapshots
                    # of the LOB, even if there has been a 'long' duration between
                    # consecutive ticks.
                    coinbase_order_book_snapshot = \
                        coinbase_order_book.render_book()
                    bitfinex_order_book_snapshot = \
                        bitfinex_order_book.render_book()
                    midpoint_delta = coinbase_order_book.midpoint - \
                        bitfinex_order_book.midpoint

                    # update the LOB snapshot time-delta AND add LOB snapshots to the
                    # data buffer.
                    for _ in range(multiple):
                        last_snapshot_time += snapshot_interval
                        snapshot_list.append(
                            np.hstack((
                                last_snapshot_time,
                                coinbase_order_book.midpoint,  # midpoint price
                                midpoint_delta,  # price delta between exchanges
                                coinbase_order_book_snapshot,
                                bitfinex_order_book_snapshot)))  # longs/shorts

                    # update order book with most recent tick now, so the snapshots
                    # are up to date for the next iteration of the loop.
                    coinbase_order_book.new_tick(tick)
                    continue
                else:  # do not include bitfinex
                    coinbase_order_book_snapshot = \
                        coinbase_order_book.render_book()
                    for _ in range(multiple):
                        last_snapshot_time += snapshot_interval
                        snapshot_list.append(
                            np.hstack((last_snapshot_time,
                                       coinbase_order_book.midpoint,
                                       coinbase_order_book_snapshot)))

                    # update order book with most recent tick now, so the snapshots
                    # are up to date for the next iteration of the loop.
                    coinbase_order_book.new_tick(tick)
                    continue

            # incoming tick is from Bitfinex exchange
            elif include_bitfinex and bitfinex_order_book.done_warming_up():
                bitfinex_order_book.new_tick(tick)
                continue

        elapsed = int((dt.now(TIMEZONE) - start_time).total_seconds())
        # a run shorter than one second would otherwise divide by zero
        ticks_per_second = loop_length // max(elapsed, 1)
        print('Completed run_simulation() with %i ticks in %i seconds '
              'at %i ticks/second' %
              (loop_length, elapsed, ticks_per_second))

        orderbook_snapshot_history = pd.DataFrame(
            snapshot_list,
            columns=self.get_feature_labels(
                include_system_time=True,
                include_spread=False,
                include_bitfinex=include_bitfinex,
                include_order_flow=INCLUDE_ORDERFLOW,
                include_imbalances=False,
                include_ema=self.alpha))
        orderbook_snapshot_history = orderbook_snapshot_history.dropna(axis=0)

        return orderbook_snapshot_history

    def extract_features(self, query: dict):
        """
        Replay the queried tick history and export the resulting limit order
        book snapshots to compressed csv, one file per calendar date.

        The first date found in the snapshots is not exported.
        NOTE(review): presumably because that day is incomplete (the books are
        still warming up), so that every exported day starts and ends exactly
        on time -- confirm.

        :param query: (dict) ccy=sym, daterange=(YYYYMMDD,YYYYMMDD)
        :return: void
        """
        started_at = dt.now(tz=TIMEZONE)

        snapshots = self.get_orderbook_snapshot_history(query=query)
        if snapshots is not None:
            # calendar date of every snapshot row
            snapshot_dates = snapshots['system_time'].dt.date
            dates = snapshot_dates.unique()
            print('dates: {}'.format(dates))
            for date in dates[1:]:
                file_name = '{}_{}'.format(query['ccy'][0], date)
                single_day = snapshots.loc[snapshot_dates == date]
                self.export_to_csv(single_day,
                                   filename=file_name,
                                   compress=True)

        elapsed = (dt.now(tz=TIMEZONE) - started_at).seconds
        print('***\nSimulator.extract_features() executed in %i seconds\n***' %
              elapsed)