def bcolz_exchange_daily_write_read(self, exchange_name):
    """Round-trip a generated daily OHLCV frame through the bcolz
    writer/reader pair and assert the data survives unchanged.
    """
    start = pd.to_datetime('2017-10-01 00:00')
    end = pd.to_datetime('today')
    freq = 'daily'
    bundle = ExchangeBundle(exchange_name)

    df = self.generate_df(exchange_name, freq, start, end)
    print(df.index[0], df.index[-1])

    writer = BcolzExchangeBarWriter(
        rootdir=self.root_dir,
        start_session=df.index[0],
        end_session=df.index[-1],
        data_frequency=freq,
        write_metadata=True)
    # sid 1 is the only asset written in this fixture
    writer.write([(1, df)])

    reader = BcolzExchangeBarReader(
        rootdir=self.root_dir, data_frequency=freq)
    arrays = reader.load_raw_arrays(self.columns, start, end, [1, ])

    periods = bundle.get_calendar_periods_range(start, end, freq)
    dx = get_df_from_arrays(arrays, periods)
    assert_equals(df.equals(dx), True)
def chunk_to_df(self, exchange_name, symbol, data_frequency, period):
    """Fetch one pre-built bcolz chunk for *symbol* and return its
    contents as a DataFrame indexed by the covered calendar periods.
    """
    exchange = get_exchange(exchange_name)
    asset = exchange.get_asset(symbol)

    filename = get_bcolz_chunk(
        exchange_name=exchange_name,
        symbol=symbol,
        data_frequency=data_frequency,
        period=period,
    )
    reader = BcolzExchangeBarReader(
        rootdir=filename, data_frequency=data_frequency)

    start = reader.first_trading_day
    end = reader.last_available_dt
    if data_frequency == 'daily':
        # NOTE(review): presumably strips the intraday offset so the
        # end bound lands on the session stamp — confirm against writer.
        end = end - pd.Timedelta(hours=23, minutes=59)
    print(start, end, data_frequency)

    arrays = reader.load_raw_arrays(self.columns, start, end, [asset.sid, ])

    periods = ExchangeBundle(exchange_name).get_calendar_periods_range(
        start, end, data_frequency)
    return get_df_from_arrays(arrays, periods)
def bcolz_exchange_daily_write_read(self, exchange_name):
    """Write a generated daily frame into a bcolz bundle, read it back,
    and verify the round-trip is lossless.
    """
    freq = 'daily'
    start = pd.to_datetime('2017-10-01 00:00')
    end = pd.to_datetime('today')

    df = self.generate_df(exchange_name, freq, start, end)
    print(df.index[0], df.index[-1])

    writer = BcolzExchangeBarWriter(
        rootdir=self.root_dir,
        start_session=df.index[0],
        end_session=df.index[-1],
        data_frequency=freq,
        write_metadata=True)
    data = [(1, df)]  # single asset, sid 1
    writer.write(data)

    reader = BcolzExchangeBarReader(
        rootdir=self.root_dir, data_frequency=freq)
    arrays = reader.load_raw_arrays(self.columns, start, end, [1, ])

    bundle = ExchangeBundle(exchange_name)
    periods = bundle.get_calendar_periods_range(start, end, freq)
    dx = get_df_from_arrays(arrays, periods)
    assert_equals(df.equals(dx), True)
def _bundle_to_csv(self, asset, exchange_name, data_frequency, filename,
                   path=None, start_dt=None, end_dt=None):
    """Dump one asset's OHLCV data from a bundle to a csv file under
    the system temp directory.

    Parameters
    ----------
    asset: TradingPair
    exchange_name: str
    data_frequency: str
    filename: str
        Base name of the csv file (without extension).
    path: str, optional
        Explicit bundle root to read from.
    start_dt, end_dt: optional
        Date bounds; default to the reader's full range.
    """
    bundle = ExchangeBundle(exchange_name)
    reader = bundle.get_reader(data_frequency, path=path)
    if start_dt is None:
        start_dt = reader.first_trading_day

    if end_dt is None:
        end_dt = reader.last_available_dt

    if data_frequency == 'daily':
        # strip the intraday offset so the end bound is the session stamp
        end_dt = end_dt - pd.Timedelta(hours=23, minutes=59)

    arrays = None
    try:
        arrays = reader.load_raw_arrays(
            sids=[asset.sid],
            fields=['open', 'high', 'low', 'close', 'volume'],
            start_dt=start_dt,
            end_dt=end_dt)
    except Exception as e:
        log.warn('skipping ctable for {} from {} to {}: {}'.format(
            asset.symbol, start_dt, end_dt, e))

    # BUGFIX: the original fell through to get_df_from_arrays(None, ...)
    # when the read failed; bail out here like the other bundle readers
    # (download_from_catalyst / ingest_ctable) do.
    if not arrays:
        return

    periods = bundle.get_calendar_periods_range(
        start_dt, end_dt, data_frequency)
    df = get_df_from_arrays(arrays, periods)

    folder = os.path.join(
        tempfile.gettempdir(), 'catalyst', exchange_name, asset.symbol)
    ensure_directory(folder)

    path = os.path.join(folder, filename + '.csv')
    log.info('creating csv file: {}'.format(path))
    print('HEAD\n{}'.format(df.head(100)))
    print('TAIL\n{}'.format(df.tail(100)))
    df.to_csv(path)
def download_from_catalyst(self, asset, data_frequency, period):
    """Download and extract a pre-built bundle chunk for *asset*, then
    load its contents.

    Returns the bundle root path (str) when no data could be read, or a
    ``(DataFrame, reader)`` tuple on success.
    """
    # Download and extract the bundle
    path = get_bcolz_chunk(
        exchange_name=self.exchange_name,
        symbol=asset.symbol,
        data_frequency=data_frequency,
        period=period)

    reader = self.get_reader(data_frequency, path=path)
    if reader is None:
        # Best-effort cleanup of the unusable bundle before raising.
        try:
            log.warn('the reader is unable to use bundle: {}, '
                     'deleting it.'.format(path))
            shutil.rmtree(path)
        except Exception as e:
            log.warn('unable to remove temp bundle: {}'.format(e))
        raise TempBundleNotFoundError(path=path)

    start_dt = reader.first_trading_day
    end_dt = reader.last_available_dt
    if data_frequency == 'daily':
        # NOTE(review): presumably normalizes the end bound to the
        # session stamp for daily data — confirm against the writer.
        end_dt -= pd.Timedelta(hours=23, minutes=59)

    arrays = None
    try:
        arrays = reader.load_raw_arrays(
            sids=[asset.sid],
            fields=['open', 'high', 'low', 'close', 'volume'],
            start_dt=start_dt,
            end_dt=end_dt)
    except Exception as e:
        log.warn('skipping ctable for {} from {} to {}: {}'.format(
            asset.symbol, start_dt, end_dt, e))

    if not arrays:
        # NOTE(review): mixed return type — callers receive the bundle
        # root here but a (df, reader) tuple below; verify call sites.
        return reader._rootdir

    periods = self.get_calendar_periods_range(
        start_dt, end_dt, data_frequency)
    return get_df_from_arrays(arrays, periods), reader
def ingest_ctable(self, asset, data_frequency, period, writer,
                  empty_rows_behavior='strip',
                  duplicates_threshold=100,
                  cleanup=False):
    """
    Merge a ctable bundle chunk into the main bundle for the exchange.

    Parameters
    ----------
    asset: TradingPair
    data_frequency: str
    period: str
        Identifier of the chunk to download (passed to get_bcolz_chunk).
    writer:
        Bundle writer the chunk is merged into (passed to ingest_df).
    empty_rows_behavior: str
        Ensure that the bundle does not have any missing data.
    duplicates_threshold: int
        Forwarded to ingest_df.
    cleanup: bool
        Remove the temp bundle directory after ingestion.

    Returns
    -------
    list[str]
        A list of problems which occurred during ingestion.
    """
    problems = []

    # Download and extract the bundle
    path = get_bcolz_chunk(
        exchange_name=self.exchange_name,
        symbol=asset.symbol,
        data_frequency=data_frequency,
        period=period
    )
    reader = self.get_reader(data_frequency, path=path)
    if reader is None:
        # The chunk is unusable: delete it (best effort) and raise.
        try:
            log.warn('the reader is unable to use bundle: {}, '
                     'deleting it.'.format(path))
            shutil.rmtree(path)
        except Exception as e:
            log.warn('unable to remove temp bundle: {}'.format(e))
        raise TempBundleNotFoundError(path=path)

    start_dt = reader.first_trading_day
    end_dt = reader.last_available_dt
    if data_frequency == 'daily':
        # NOTE(review): presumably strips the intraday offset so the end
        # bound lands on the daily session stamp — confirm against writer.
        end_dt = end_dt - pd.Timedelta(hours=23, minutes=59)

    arrays = None
    try:
        arrays = reader.load_raw_arrays(
            sids=[asset.sid],
            fields=['open', 'high', 'low', 'close', 'volume'],
            start_dt=start_dt,
            end_dt=end_dt
        )
    except Exception as e:
        log.warn('skipping ctable for {} from {} to {}: {}'.format(
            asset.symbol, start_dt, end_dt, e
        ))

    if not arrays:
        # Nothing readable in the chunk; hand back its root dir.
        # NOTE(review): return type differs from the documented list[str].
        return reader._rootdir

    periods = self.get_calendar_periods_range(
        start_dt, end_dt, data_frequency
    )
    df = get_df_from_arrays(arrays, periods)
    problems += self.ingest_df(
        ohlcv_df=df,
        data_frequency=data_frequency,
        asset=asset,
        writer=writer,
        empty_rows_behavior=empty_rows_behavior,
        duplicates_threshold=duplicates_threshold
    )

    if cleanup:
        log.debug(
            'removing bundle folder following ingestion: {}'.format(
                reader._rootdir)
        )
        shutil.rmtree(reader._rootdir)

    # Drop None entries; note this returns a filter iterator, not a list.
    return filter(partial(is_not, None), problems)