def download_with_progress(url, chunk_size, **progress_kwargs):
    """
    Download streaming data from a URL, printing progress information to the
    terminal.

    Parameters
    ----------
    url : str
        A URL that can be understood by ``requests.get``.
    chunk_size : int
        Number of bytes to read at a time from requests.
    **progress_kwargs
        Forwarded to ``maybe_show_progress`` (presumably on to
        ``click.progressbar``). Any ``length`` entry is overwritten with the
        size reported by the ``content-length`` response header.

    Returns
    -------
    data : BytesIO
        A BytesIO containing the downloaded data, rewound to offset 0.

    Raises
    ------
    requests.HTTPError
        If ``raise_for_status`` rejects the response.
    KeyError
        If the response has no ``content-length`` header.
    """
    resp = requests.get(url, stream=True)
    # With ``stream=True`` the connection is only handed back once the body
    # has been fully consumed or the response is closed, so close it
    # explicitly on every exit path (including failures mid-download).
    try:
        resp.raise_for_status()

        total_size = int(resp.headers['content-length'])
        data = BytesIO()

        progress_kwargs['length'] = total_size
        # No iterable is handed to the progress helper; the bar is advanced
        # manually by the size of every chunk received.
        with maybe_show_progress(None, True, **progress_kwargs) as pbar:
            for chunk in resp.iter_content(chunk_size=chunk_size):
                data.write(chunk)
                pbar.update(len(chunk))
    finally:
        resp.close()

    data.seek(0)
    return data
def write(self, data, length=None, show_progress=False,
          invalid_data_behavior='warn'):
    """Write a stream of minute data.

    Parameters
    ----------
    data : iterable[(int, pd.DataFrame)]
        The data to write. Each element should be a tuple of sid, data
        where data has the following format:
          columns : ('open', 'high', 'low', 'close', 'volume')
              open : float64
              high : float64
              low  : float64
              close : float64
              volume : float64|int64
          index : DatetimeIndex of market minutes.
        A given sid may appear more than once in ``data``; however,
        the dates must be strictly increasing.
    length : int, optional
        Forwarded both to ``item_show_count`` and to the progress helper as
        its ``length`` argument.
    show_progress : bool, optional
        Whether or not to show a progress bar while writing.
    invalid_data_behavior : str, optional
        Passed unchanged to ``self.write_sid`` for every element.
    """
    progress = maybe_show_progress(
        data,
        length=length,
        show_percent=False,
        show_progress=show_progress,
        item_show_func=item_show_count(length),
        label='Compiling five-minute data',
    )
    with progress as entries:
        # Bind the method once; it is invoked for every element.
        emit = self.write_sid
        for entry in entries:
            emit(*entry, invalid_data_behavior=invalid_data_behavior)
def write(self, data, show_progress=False, invalid_data_behavior='warn'):
    """Write a stream of minute data.

    Parameters
    ----------
    data : iterable[(int, pd.DataFrame)]
        The data to write. Each element should be a tuple of sid, data
        where data has the following format:
          columns : ('open', 'high', 'low', 'close', 'volume')
              open : float64
              high : float64
              low  : float64
              close : float64
              volume : float64|int64
          index : DatetimeIndex of market minutes.
        A given sid may appear more than once in ``data``; however,
        the dates must be strictly increasing.
    show_progress : bool, optional
        Whether or not to show a progress bar while writing.
    invalid_data_behavior : str, optional
        Passed unchanged to ``self.write_sid`` for every element.
    """
    def _describe(item):
        # The progress helper may hand over None instead of an element
        # (presumably when there is no current item); pass that through.
        # Otherwise show the first tuple member (the sid) as text.
        if item is None:
            return item
        return str(item[0])

    progress = maybe_show_progress(
        data,
        show_progress=show_progress,
        item_show_func=_describe,
        label="Merging minute equity files:",
    )
    emit = self.write_sid
    with progress as entries:
        for entry in entries:
            emit(*entry, invalid_data_behavior=invalid_data_behavior)
def write(self, data, show_progress=False, invalid_data_behavior='warn'):
    """Write a stream of minute data.

    Parameters
    ----------
    data : iterable[(int, pd.DataFrame)]
        The data to write. Each element should be a tuple of sid, data
        where data has the following format:
          columns : ('open', 'high', 'low', 'close', 'volume')
              open : float64
              high : float64
              low  : float64
              close : float64
              volume : float64|int64
          index : DatetimeIndex of market minutes.
        A given sid may appear more than once in ``data``; however,
        the dates must be strictly increasing.
    show_progress : bool, optional
        Whether or not to show a progress bar while writing.
    invalid_data_behavior : str, optional
        Handed through to ``self.write_sid`` on every call.
    """
    def _sid_text(current):
        # ``current`` is either an element of ``data`` or None; None is
        # returned untouched, anything else is reduced to its sid as text.
        return current if current is None else str(current[0])

    bar = maybe_show_progress(
        data,
        show_progress=show_progress,
        item_show_func=_sid_text,
        label="Merging minute equity files:",
    )
    write_one = self.write_sid
    with bar as stream:
        for record in stream:
            write_one(*record, invalid_data_behavior=invalid_data_behavior)
def download_with_progress(url, chunk_size, **progress_kwargs):
    """
    Download streaming data from a URL, printing progress information to the
    terminal.

    Parameters
    ----------
    url : str
        A URL that can be understood by ``requests.get``.
    chunk_size : int
        Number of bytes to read at a time from requests.
    **progress_kwargs
        Forwarded to ``maybe_show_progress`` (presumably on to
        ``click.progressbar``). Any ``length`` entry is overwritten with the
        size reported by the ``content-length`` response header.

    Returns
    -------
    data : BytesIO
        A BytesIO containing the downloaded data, rewound to offset 0.

    Raises
    ------
    requests.HTTPError
        If ``raise_for_status`` rejects the response.
    KeyError
        If the response has no ``content-length`` header.
    """
    resp = requests.get(url, stream=True)
    # A streamed response keeps its connection checked out until the body is
    # exhausted or ``close`` is called; always release it, even when the
    # status check, the header lookup or the transfer itself fails.
    try:
        resp.raise_for_status()

        total_size = int(resp.headers['content-length'])
        data = BytesIO()

        progress_kwargs['length'] = total_size
        # The bar has no iterable of its own; it is advanced by hand with the
        # byte count of each chunk.
        with maybe_show_progress(None, True, **progress_kwargs) as pbar:
            for chunk in resp.iter_content(chunk_size=chunk_size):
                data.write(chunk)
                pbar.update(len(chunk))
    finally:
        resp.close()

    data.seek(0)
    return data
def ingest_assets(self, assets, start_dt, end_dt, data_frequency,
                  show_progress=False):
    """
    Ingest price data for the given assets, one prepared chunk at a time.

    Parameters
    ----------
    assets
        Forwarded to ``self.prepare_chunks``.
    start_dt
        Start of the range; used for the writer and for chunk preparation.
    end_dt
        End of the range; used for the writer and for chunk preparation.
    data_frequency
        Frequency passed to the writer, the chunk preparation and every
        ``ingest_ctable`` call; also shown in the progress label.
    show_progress : bool, optional
        Whether to display a progress bar while iterating over the chunks.
    """
    writer = self.get_writer(start_dt, end_dt, data_frequency)
    chunks = self.prepare_chunks(
        assets=assets,
        data_frequency=data_frequency,
        start_dt=start_dt,
        end_dt=end_dt,
    )

    label = 'Fetching {exchange} {frequency} candles: '.format(
        exchange=self.exchange.name,
        frequency=data_frequency,
    )
    with maybe_show_progress(chunks, show_progress, label=label) as it:
        for chunk in it:
            # Each chunk carries its own asset, period and date bounds.
            self.ingest_ctable(
                asset=chunk['asset'],
                data_frequency=data_frequency,
                period=chunk['period'],
                start_dt=chunk['period_start'],
                end_dt=chunk['period_end'],
                writer=writer,
                empty_rows_behavior='strip',
            )
def _post_process_metadata(self, metadata, cache, show_progress=False):
    """
    Build the final metadata frame from the raw per-symbol data in ``cache``.

    Parameters
    ----------
    metadata : pd.DataFrame
        Frame with a ``symbol`` column; its index is reused for the result.
    cache : mapping
        Looked up with ``'<symbol>.daily.frame'`` keys.
    show_progress : bool, optional
        Whether to display a progress bar while processing symbols.

    Returns
    -------
    final_metadata : pd.DataFrame
        Frame with ``self.md_column_names`` columns plus an ``exchange``
        column set to ``self.exchange``.

    Raises
    ------
    ValueError
        If the cache has no entry for one of the symbols.
    """
    # Start from an empty frame that has the target columns and the same
    # index as the incoming metadata.
    final_metadata = pd.DataFrame(
        columns=self.md_column_names,
        index=metadata.index,
    )

    n_symbols = len(metadata)
    progress = maybe_show_progress(
        metadata.symbol.iteritems(),
        show_progress,
        label='Post-processing symbol metadata',
        item_show_func=item_show_count(n_symbols),
        length=n_symbols,
        show_percent=False,
    )
    with progress as symbols_map:
        for asset_id, symbol in symbols_map:
            # Every symbol is expected to be cached by now; a missing entry
            # is treated as a hard failure.
            cache_key = '{sym}.daily.frame'.format(sym=symbol)
            try:
                raw_data = cache[cache_key]
            except KeyError:
                raise ValueError(
                    'Unable to find cached data for symbol: {0}'.format(
                        symbol))

            # Compute the symbol's final metadata and store it in the row
            # addressed by the asset id.
            final_metadata.iloc[asset_id] = \
                self.post_process_symbol_metadata(
                    asset_id,
                    metadata.iloc[asset_id],
                    raw_data,
                )

    # All assets are registered with the bundle's default exchange.
    final_metadata['exchange'] = self.exchange

    return final_metadata
def _post_process_metadata(self, metadata, cache, show_progress=False):
    """
    Derive the final metadata frame from cached raw symbol data.

    Parameters
    ----------
    metadata : pd.DataFrame
        Frame with a ``symbol`` column; its index is reused for the result.
    cache : mapping
        Looked up with ``'<symbol>.daily.frame'`` keys.
    show_progress : bool, optional
        Whether to display a progress bar while processing symbols.

    Returns
    -------
    final_metadata : pd.DataFrame
        Frame with ``self.md_column_names`` columns plus an ``exchange``
        column set to ``self.exchange``.

    Raises
    ------
    ValueError
        If the cache has no entry for one of the symbols.
    """
    # Empty result frame: target column names, index shared with the input.
    result = pd.DataFrame(
        columns=self.md_column_names,
        index=metadata.index,
    )

    total = len(metadata)
    bar = maybe_show_progress(
        metadata.symbol.iteritems(),
        show_progress,
        label='Post-processing symbol metadata',
        item_show_func=item_show_count(total),
        length=total,
        show_percent=False,
    )
    with bar as pairs:
        for asset_id, symbol in pairs:
            # The raw data must already be in the cache; a miss is reported
            # as a ValueError naming the symbol.
            key = '{sym}.daily.frame'.format(sym=symbol)
            try:
                raw_data = cache[key]
            except KeyError:
                message = 'Unable to find cached data for symbol: {0}'
                raise ValueError(message.format(symbol))

            # Post-process this symbol and record the outcome in its row.
            processed = self.post_process_symbol_metadata(
                asset_id,
                metadata.iloc[asset_id],
                raw_data,
            )
            result.iloc[asset_id] = processed

    # Every asset is tagged with the bundle's default exchange.
    result['exchange'] = self.exchange

    return result
def write(self, data, assets=None, show_progress=False,
          invalid_data_behavior='warn'):
    """
    Convert each chunk with ``to_ctable`` and write the resulting stream.

    Parameters
    ----------
    data : iterable[tuple[int, pandas.DataFrame or bcolz.ctable]]
        The data chunks to write. Each chunk should be a tuple of sid
        and the data for that asset.
    assets : set[int], optional
        The assets that should be in ``data``. When given, its size is used
        as the progress length and it is forwarded to ``_write_internal``.
    show_progress : bool, optional
        Whether or not to show a progress bar while writing.
    invalid_data_behavior : {'warn', 'raise', 'ignore'}, optional
        Forwarded to ``self.to_ctable`` for every chunk.

    Returns
    -------
    table : bcolz.ctable
        Whatever ``self._write_internal`` returns (documented upstream as
        the newly-written table).
    """
    if assets is None:
        total = None
    else:
        total = len(assets)

    # Conversion stays lazy: each chunk is turned into a ctable only when
    # the writer pulls it through the progress wrapper.
    converted = (
        (sid, self.to_ctable(frame, invalid_data_behavior))
        for sid, frame in data
    )
    progress = maybe_show_progress(
        converted,
        show_progress=show_progress,
        label=self.progress_bar_message,
        item_show_func=item_show_count(total),
        length=total,
        show_percent=False,
    )
    with progress as it:
        return self._write_internal(it, assets)
def _pricing_iter():
    """
    Yield ``(sid, frame)`` pairs of pricing data, one per entry of ``symbols``.

    Sids are assigned sequentially from 0 in iteration order. As a side
    effect, row ``sid`` of ``metadata`` is filled with the first and last
    index value of the frame, the auto-close date and the symbol.
    """
    lowercase_columns = {
        'Open': 'open',
        'High': 'high',
        'Low': 'low',
        'Close': 'close',
        'Volume': 'volume',
    }
    progress = maybe_show_progress(
        symbols,
        show_progress,
        label='Downloading Yahoo pricing data: ',
    )
    with progress as it, requests.Session() as session:
        for sid, symbol in enumerate(it):
            path = _cachpath(symbol, 'ohlcv')
            try:
                df = cache[path]
            except KeyError:
                # Not cached yet: download, sort by index and cache.
                df = cache[path] = DataReader(
                    symbol,
                    'yahoo',
                    start,
                    end,
                    session=session,
                ).sort_index()

            # First and last index entries are taken as the dates of the
            # first and last trade.
            first_trade = df.index[0]
            last_trade = df.index[-1]
            # The auto_close date is the day after the last trade.
            auto_close = last_trade + pd.Timedelta(days=1)
            metadata.iloc[sid] = first_trade, last_trade, auto_close, symbol

            # Renaming happens in place, so the cached frame is affected too.
            df.rename(columns=lowercase_columns, inplace=True)
            yield sid, df
def write(self, data, assets=None, show_progress=False,
          invalid_data_behavior='warn'):
    """
    Write the given chunks after converting each one with ``to_ctable``.

    Parameters
    ----------
    data : iterable[tuple[int, pandas.DataFrame or bcolz.ctable]]
        The data chunks to write. Each chunk should be a tuple of sid
        and the data for that asset.
    assets : set[int], optional
        The assets that should be in ``data``. When given, its size is used
        as the progress length and it is forwarded to ``_write_internal``.
    show_progress : bool, optional
        Whether or not to show a progress bar while writing.
    invalid_data_behavior : {'warn', 'raise', 'ignore'}, optional
        Forwarded to ``self.to_ctable`` for every chunk.

    Returns
    -------
    table : bcolz.ctable
        Whatever ``self._write_internal`` returns (documented upstream as
        the newly-written table).
    """
    # The number of expected items is only known when ``assets`` is given.
    expected = len(assets) if assets is not None else None

    # Lazily pair every sid with its converted table.
    ctables = (
        (sid, self.to_ctable(raw, invalid_data_behavior))
        for sid, raw in data
    )
    bar = maybe_show_progress(
        ctables,
        show_progress=show_progress,
        label=self.progress_bar_message,
        item_show_func=item_show_count(expected),
        length=expected,
        show_percent=False,
    )
    with bar as stream:
        return self._write_internal(stream, assets)
def _pricing_iter():
    """
    Generate ``(sid, frame)`` pricing pairs for every entry of ``symbols``.

    Sids count up from 0 in iteration order. For each symbol, row ``sid`` of
    ``metadata`` is filled with the frame's first and last index value, the
    auto-close date and the symbol itself.
    """
    bar = maybe_show_progress(
        symbols,
        show_progress,
        label='Downloading Yahoo pricing data: ',
    )
    with bar as it:
        with requests.Session() as session:
            for sid, symbol in enumerate(it):
                path = _cachpath(symbol, 'ohlcv')
                try:
                    frame = cache[path]
                except KeyError:
                    # Cache miss: fetch, order by index and remember it.
                    frame = DataReader(
                        symbol,
                        'yahoo',
                        start,
                        end,
                        session=session,
                    ).sort_index()
                    cache[path] = frame

                # The index bounds stand for the first and last trade dates.
                start_date, end_date = frame.index[0], frame.index[-1]
                # The auto_close date is the day after the last trade.
                ac_date = end_date + pd.Timedelta(days=1)
                metadata.iloc[sid] = start_date, end_date, ac_date, symbol

                # In-place rename; this also changes the cached object.
                frame.rename(
                    columns={
                        'Open': 'open',
                        'High': 'high',
                        'Low': 'low',
                        'Close': 'close',
                        'Volume': 'volume',
                    },
                    inplace=True,
                )
                yield sid, frame
def _fetch_metadata_frame(self, api_key, cache, retries=DEFAULT_RETRIES,
                          environ=None, show_progress=False):
    """
    Concatenate all raw metadata blocks into a single frame.

    Parameters
    ----------
    api_key, cache, retries, environ
        Forwarded unchanged to ``self._fetch_metadata_iter``.
    show_progress : bool, optional
        Whether to display a progress bar while the blocks are consumed.

    Returns
    -------
    metadata : pd.DataFrame
        The blocks concatenated with a fresh 0..n-1 index.
    """
    # Iterator over the raw metadata blocks (pages).
    pages = self._fetch_metadata_iter(api_key, cache, retries, environ)

    progress = maybe_show_progress(
        pages,
        show_progress,
        label='Fetching symbol metadata',
        item_show_func=item_show_count(),
        # Hard-coded length; presumably the expected page count - TODO
        # confirm against ``_fetch_metadata_iter``.
        length=3,
        show_percent=False,
    )
    with progress as blocks:
        return pd.concat(blocks, ignore_index=True)
def _fetch_metadata_frame(self, api_key, cache, retries=DEFAULT_RETRIES,
                          environ=None, show_progress=False):
    """
    Fetch every raw metadata block and merge them into one frame.

    Parameters
    ----------
    api_key, cache, retries, environ
        Handed as-is to ``self._fetch_metadata_iter``.
    show_progress : bool, optional
        Whether to display a progress bar while the blocks are consumed.

    Returns
    -------
    metadata : pd.DataFrame
        All blocks concatenated, with the original indices discarded.
    """
    block_source = self._fetch_metadata_iter(
        api_key, cache, retries, environ,
    )

    # NOTE(review): ``length=3`` is a fixed value, not derived from the
    # iterator; verify it matches the number of blocks actually produced.
    bar = maybe_show_progress(
        block_source,
        show_progress,
        label='Fetching symbol metadata',
        item_show_func=item_show_count(),
        length=3,
        show_percent=False,
    )
    with bar as frames:
        combined = pd.concat(frames, ignore_index=True)

    return combined
def ingest_assets(self, assets, data_frequency, start_dt=None, end_dt=None,
                  show_progress=False, show_breakdown=False,
                  show_report=False):
    """
    Ingest price data for the given assets into the exchange bundle.

    Parameters
    ----------
    assets: list[TradingPair]
        Forwarded to ``self.prepare_chunks``.
    data_frequency: str
        ``'minute'`` selects month-based period bounds; any other value
        selects year-based bounds.
    start_dt: pd.Timestamp
        Defaults to ``self.calendar.first_session`` when None.
    end_dt: pd.Timestamp
        Defaults to ``pd.Timestamp.utcnow()`` when None.
    show_progress: bool
        Whether to display progress bars.
    show_breakdown: bool
        When true, one progress bar is shown per asset; otherwise a single
        bar covers all chunks, sorted by period.
    show_report: bool
        When true and problems were collected, they are logged at the end.
    """
    if start_dt is None:
        start_dt = self.calendar.first_session

    if end_dt is None:
        end_dt = pd.Timestamp.utcnow()

    if data_frequency == 'minute':
        period_bounds = get_month_start_end
    else:
        period_bounds = get_year_start_end

    # Widen the range to whole periods: keep the first element of the bounds
    # of the start date and the second element of the bounds of the end date.
    start_dt, _ignored = period_bounds(start_dt)
    _ignored, end_dt = period_bounds(end_dt)

    chunks = self.prepare_chunks(
        assets=assets,
        data_frequency=data_frequency,
        start_dt=start_dt,
        end_dt=end_dt,
    )

    problems = []

    # One writer is shared by every chunk of this exchange bundle.
    writer = self.get_writer(start_dt, end_dt, data_frequency)

    if show_breakdown:
        # Lazily produce one (chunks, label) batch per asset.
        batches = (
            (
                chunks[asset],
                'Ingesting {frequency} price data for '
                '{symbol} on {exchange}'.format(
                    exchange=self.exchange_name,
                    frequency=data_frequency,
                    symbol=asset.symbol,
                ),
            )
            for asset in chunks
        )
    else:
        # Flatten all per-asset chunk lists and order them by period,
        # ascending.
        all_chunks = sorted(
            chain.from_iterable(itervalues(chunks)),
            key=lambda item: pd.to_datetime(item['period']),
        )
        batches = [(
            all_chunks,
            'Ingesting {frequency} price data on '
            '{exchange}'.format(
                exchange=self.exchange_name,
                frequency=data_frequency,
            ),
        )]

    for batch, label in batches:
        with maybe_show_progress(batch, show_progress, label=label) as it:
            for chunk in it:
                problems.extend(
                    self.ingest_ctable(
                        asset=chunk['asset'],
                        data_frequency=data_frequency,
                        period=chunk['period'],
                        writer=writer,
                        empty_rows_behavior='strip',
                        cleanup=True,
                    )
                )

    if show_report and problems:
        log.info('problems during ingestion:{}\n'.format(
            '\n'.join(problems)))
def fetch_symbol_metadata_frame(api_key,
                                cache,
                                retries=5,
                                environ=None,
                                show_progress=False):
    """
    Download Quandl symbol metadata.

    Parameters
    ----------
    api_key : str
        The quandl api key to use. If this is None then no api key will be
        sent.
    cache : DataFrameCache
        The cache to use for persisting the intermediate data.
    retries : int, optional
        The number of times to retry each request before failing.
    environ : mapping[str -> str], optional
        The environment to use to find the catalyst home. By default this
        is ``os.environ``.
    show_progress : bool, optional
        Show a progress bar for the download of this data.

    Returns
    -------
    metadata_frame : pd.DataFrame
        A dataframe, sorted by symbol, that includes the following columns:
          symbol: the asset's symbol
          asset_name: the full name of the asset, cut at the first ' ('
          start_date: the first date of data for this asset
          end_date: the last date of data for this asset
          auto_close_date: end_date + one day
          exchange: the exchange for the asset; this is always 'QUANDL'
        Rows whose symbol is in ``excluded_symbols`` are dropped.
        The index of the dataframe will be used for symbol->sid mappings but
        otherwise does not have specific meaning.
    """
    raw_iter = _fetch_raw_metadata(api_key, cache, retries, environ)

    def item_show_func(_, _it=iter(count())):
        # The default argument is created once, so the counter keeps its
        # state between calls. The text must be returned; otherwise the
        # progress bar receives None and never shows the page number.
        return 'Downloading page: %d' % next(_it)

    with maybe_show_progress(raw_iter,
                             show_progress,
                             item_show_func=item_show_func,
                             label='Downloading WIKI metadata: ') as blocks:
        data = pd.concat(blocks, ignore_index=True).rename(
            columns={
                'dataset_code': 'symbol',
                'name': 'asset_name',
                'oldest_available_date': 'start_date',
                'newest_available_date': 'end_date',
            }).sort_values('symbol')

    data = data[~data.symbol.isin(excluded_symbols)]
    # cut out all the other stuff in the name column
    # we need to escape the paren because it is actually splitting on a regex
    # ``n`` is passed by keyword: newer pandas releases accept only ``pat``
    # positionally.
    data.asset_name = data.asset_name.str.split(r' \(', n=1).str.get(0)
    data['exchange'] = 'QUANDL'

    data['start_date'] = data['start_date'].astype(datetime)
    data['end_date'] = data['end_date'].astype(datetime)
    data['auto_close_date'] = data['end_date'] + pd.Timedelta(days=1)

    return data
def ingest(
        environ,
        asset_db_writer,
        minute_bar_writer,  # unused
        daily_bar_writer,
        adjustment_writer,
        calendar,
        start_session,
        end_session,
        cache,
        show_progress,
        output_dir,
        # pass these as defaults to make them 'nonlocal' in py2
        start=start,
        end=end):
    """
    Download Yahoo pricing and adjustment data for ``symbols`` and write it
    through the supplied writers.

    Daily bars go to ``daily_bar_writer``, equity metadata to
    ``asset_db_writer`` and splits/dividends to ``adjustment_writer``.
    Downloaded frames are stored in ``cache`` and reused when present.
    """
    if start is None:
        start = start_session
    # NOTE(review): unlike ``start``, no default is applied to ``end``; a
    # None value is handed to DataReader as-is. Confirm whether
    # ``end_session`` was meant to be used here.

    metadata_dtype = [
        ('start_date', 'datetime64[ns]'),
        ('end_date', 'datetime64[ns]'),
        ('auto_close_date', 'datetime64[ns]'),
        ('symbol', 'object'),
    ]
    metadata = pd.DataFrame(np.empty(len(symbols), dtype=metadata_dtype))

    def _load(symbol, kind, source, session):
        # Return the cached frame for (symbol, kind); on a miss, download it
        # from ``source``, sort it by index and cache it first.
        path = _cachpath(symbol, kind)
        try:
            return cache[path]
        except KeyError:
            frame = cache[path] = DataReader(
                symbol,
                source,
                start,
                end,
                session=session,
            ).sort_index()
            return frame

    def _pricing_iter():
        lowercase_columns = {
            'Open': 'open',
            'High': 'high',
            'Low': 'low',
            'Close': 'close',
            'Volume': 'volume',
        }
        with maybe_show_progress(
                symbols,
                show_progress,
                label='Downloading Yahoo pricing data: ') as it, \
                requests.Session() as session:
            for sid, symbol in enumerate(it):
                df = _load(symbol, 'ohlcv', 'yahoo', session)

                # the start date is the date of the first trade and
                # the end date is the date of the last trade
                start_date = df.index[0]
                end_date = df.index[-1]
                # The auto_close date is the day after the last trade.
                ac_date = end_date + pd.Timedelta(days=1)
                metadata.iloc[sid] = start_date, end_date, ac_date, symbol

                # In-place rename; the cached frame is modified as well.
                df.rename(columns=lowercase_columns, inplace=True)
                yield sid, df

    daily_bar_writer.write(_pricing_iter(), show_progress=show_progress)

    # Maps each symbol to its row position in ``metadata`` (its sid).
    symbol_map = pd.Series(
        data=metadata.symbol.index,
        index=metadata.symbol,
    )

    # Hardcode the exchange to "YAHOO" for all assets and (elsewhere)
    # register "YAHOO" to resolve to the NYSE calendar, because these are
    # all equities and thus can use the NYSE calendar.
    metadata['exchange'] = "YAHOO"
    asset_db_writer.write(equities=metadata)

    adjustments = []
    with maybe_show_progress(
            symbols,
            show_progress,
            label='Downloading Yahoo adjustment data: ') as it, \
            requests.Session() as session:
        for symbol in it:
            df = _load(symbol, 'adjustment', 'yahoo-actions', session)
            df['sid'] = symbol_map[symbol]
            adjustments.append(df)

    adj_df = pd.concat(adjustments)
    adj_df.index.name = 'date'
    adj_df.reset_index(inplace=True)

    splits = adj_df[adj_df.action == 'SPLIT'].rename(
        columns={'value': 'ratio', 'date': 'effective_date'},
    ).drop('action', axis=1)

    dividends = adj_df[adj_df.action == 'DIVIDEND'].rename(
        columns={'value': 'amount', 'date': 'ex_date'},
    ).drop('action', axis=1)
    # we do not have this data in the yahoo dataset
    for missing_column in ('record_date', 'declared_date', 'pay_date'):
        dividends[missing_column] = pd.NaT

    adjustment_writer.write(splits=splits, dividends=dividends)
def ingest(environ,
           asset_db_writer,
           minute_bar_writer,  # unused
           daily_bar_writer,
           adjustment_writer,
           calendar,
           start_session,
           end_session,
           cache,
           show_progress,
           output_dir,
           # pass these as defaults to make them 'nonlocal' in py2
           start=start,
           end=end):
    """
    Fetch Yahoo pricing and adjustment data for ``symbols`` and hand it to
    the supplied writers.

    Daily bars are written with ``daily_bar_writer``, equity metadata with
    ``asset_db_writer`` and splits/dividends with ``adjustment_writer``.
    Frames already present in ``cache`` are reused instead of downloaded.
    """
    if start is None:
        start = start_session
    # NOTE(review): ``end`` receives no default (a None value is forwarded
    # to DataReader unchanged); verify that ``end_session`` was not intended.

    empty_records = np.empty(
        len(symbols),
        dtype=[
            ('start_date', 'datetime64[ns]'),
            ('end_date', 'datetime64[ns]'),
            ('auto_close_date', 'datetime64[ns]'),
            ('symbol', 'object'),
        ],
    )
    metadata = pd.DataFrame(empty_records)

    def _pricing_iter():
        bar = maybe_show_progress(
            symbols,
            show_progress,
            label='Downloading Yahoo pricing data: ',
        )
        with bar as it, requests.Session() as session:
            for sid, symbol in enumerate(it):
                path = _cachpath(symbol, 'ohlcv')
                try:
                    frame = cache[path]
                except KeyError:
                    # Cache miss: download, sort by index, then cache.
                    frame = DataReader(
                        symbol,
                        'yahoo',
                        start,
                        end,
                        session=session,
                    ).sort_index()
                    cache[path] = frame

                # the start date is the date of the first trade and
                # the end date is the date of the last trade
                first_trade, last_trade = frame.index[0], frame.index[-1]
                # The auto_close date is the day after the last trade.
                auto_close = last_trade + pd.Timedelta(days=1)
                metadata.iloc[sid] = (
                    first_trade, last_trade, auto_close, symbol,
                )

                # Renamed in place, which also alters the cached frame.
                frame.rename(
                    columns={
                        'Open': 'open',
                        'High': 'high',
                        'Low': 'low',
                        'Close': 'close',
                        'Volume': 'volume',
                    },
                    inplace=True,
                )
                yield sid, frame

    daily_bar_writer.write(_pricing_iter(), show_progress=show_progress)

    # Symbol -> row position in ``metadata``, used below as the sid.
    symbol_map = pd.Series(metadata.symbol.index, index=metadata.symbol)

    # Hardcode the exchange to "YAHOO" for all assets and (elsewhere)
    # register "YAHOO" to resolve to the NYSE calendar, because these are
    # all equities and thus can use the NYSE calendar.
    metadata['exchange'] = "YAHOO"
    asset_db_writer.write(equities=metadata)

    adjustments = []
    adjustment_bar = maybe_show_progress(
        symbols,
        show_progress,
        label='Downloading Yahoo adjustment data: ',
    )
    with adjustment_bar as it, requests.Session() as session:
        for symbol in it:
            path = _cachpath(symbol, 'adjustment')
            try:
                actions = cache[path]
            except KeyError:
                actions = DataReader(
                    symbol,
                    'yahoo-actions',
                    start,
                    end,
                    session=session,
                ).sort_index()
                cache[path] = actions

            actions['sid'] = symbol_map[symbol]
            adjustments.append(actions)

    adj_df = pd.concat(adjustments)
    adj_df.index.name = 'date'
    adj_df.reset_index(inplace=True)

    is_split = adj_df.action == 'SPLIT'
    splits = adj_df[is_split].rename(
        columns={'value': 'ratio', 'date': 'effective_date'},
    )
    splits.drop('action', axis=1, inplace=True)

    is_dividend = adj_df.action == 'DIVIDEND'
    dividends = adj_df[is_dividend].rename(
        columns={'value': 'amount', 'date': 'ex_date'},
    )
    dividends.drop('action', axis=1, inplace=True)
    # we do not have this data in the yahoo dataset
    dividends['record_date'] = pd.NaT
    dividends['declared_date'] = pd.NaT
    dividends['pay_date'] = pd.NaT

    adjustment_writer.write(splits=splits, dividends=dividends)