def test_register_non_decorator(self):
    """Register, load and unregister a metrics set through plain calls.

    Covers the non-decorator form ``register(name, function)``: a second
    registration under the same name must fail without disturbing the first
    one, and after unregistering both ``load`` and ``unregister`` must
    reject the name.
    """
    name = 'ayy-lmao'
    sentinel = set()

    def produce_sentinel():
        return sentinel

    def never_called():  # pragma: no cover
        raise AssertionError('dead')

    def check_still_registered():
        # the registry holds exactly our function and loading it calls it
        assert_equal(
            self.metrics_sets,
            mappingproxy({name: produce_sentinel}),
        )
        assert_is(self.load(name), sentinel)

    self.register(name, produce_sentinel)
    check_still_registered()

    with assert_raises_str(
            ValueError,
            "metrics set 'ayy-lmao' is already registered"):
        self.register(name, never_called)

    # the rejected registration must leave the original entry intact
    check_still_registered()

    self.unregister(name)
    assert_equal(self.metrics_sets, mappingproxy({}))

    with assert_raises_str(
            ValueError,
            "no metrics set registered as 'ayy-lmao', options are: []"):
        self.load(name)

    with assert_raises_str(
            ValueError,
            "metrics set 'ayy-lmao' was not already registered"):
        self.unregister(name)
def test_register_non_decorator(self, metrics):
    """Register, load and unregister a metrics set through plain calls.

    Parameters
    ----------
    metrics : object
        Fixture exposing ``metrics_sets``, ``register``, ``unregister`` and
        ``load``. The registry is expected to be empty on entry, since the
        first equality check below allows exactly one entry.

    Notes
    -----
    ``pytest.raises(match=...)`` treats its argument as a regular
    expression, so every expected message is passed through ``re.escape``
    to compare it literally.
    """
    ayy_lmao_set = set()

    def ayy_lmao():
        return ayy_lmao_set

    metrics.register("ayy-lmao", ayy_lmao)

    expected_metrics_sets = mappingproxy({"ayy-lmao": ayy_lmao})
    assert metrics.metrics_sets == expected_metrics_sets
    assert metrics.load("ayy-lmao") is ayy_lmao_set

    def other():  # pragma: no cover
        raise AssertionError("dead")

    msg = "metrics set 'ayy-lmao' is already registered"
    with pytest.raises(ValueError, match=re.escape(msg)):
        metrics.register("ayy-lmao", other)

    # ensure that the failed registration didn't break the previously
    # registered set
    assert metrics.metrics_sets == expected_metrics_sets
    assert metrics.load("ayy-lmao") is ayy_lmao_set

    metrics.unregister("ayy-lmao")
    # plain ``assert`` like every other check in this test
    assert metrics.metrics_sets == mappingproxy({})

    msg = "no metrics set registered as 'ayy-lmao', options are: []"
    with pytest.raises(ValueError, match=re.escape(msg)):
        metrics.load("ayy-lmao")

    msg = "metrics set 'ayy-lmao' was not already registered"
    with pytest.raises(ValueError, match=re.escape(msg)):
        metrics.unregister("ayy-lmao")
def init_instance_fixtures(self):
    """Attach a freshly built metrics-set core to this test instance.

    Stores the four objects returned by ``_make_metrics_set_core`` as
    ``metrics_sets``, ``register``, ``unregister`` and ``load``.
    """
    super(MetricsSetCoreTestCase, self).init_instance_fixtures()

    registry_view, register, unregister, load = _make_metrics_set_core()
    self.metrics_sets = registry_view
    self.register = register
    self.unregister = unregister
    self.load = load

    # a new core must not carry any registrations
    assert_equal(self.metrics_sets, mappingproxy({}))
def init_instance_fixtures(self):
    """Build a private metrics-set core for the test about to run.

    The core's registry view and its ``register`` / ``unregister`` /
    ``load`` functions are exposed as attributes of the same names.
    """
    super(MetricsSetCoreTestCase, self).init_instance_fixtures()

    core = _make_metrics_set_core()
    metrics_sets, register_fn, unregister_fn, load_fn = core
    self.metrics_sets = metrics_sets
    self.register = register_fn
    self.unregister = unregister_fn
    self.load = load_fn

    # sanity check: the registry starts out with no entries
    assert_equal(self.metrics_sets, mappingproxy({}))
def __init__(self, interface):
    """
    Parameters
    ----------
    interface : type
        The abstract base class to manage
    """
    registry = {}

    self.interface = interface
    # private, mutable storage
    self._classes = registry
    # public read-only view over the same dict; it tracks later changes
    self.classes = mappingproxy(registry)
def metrics():
    """Yield a fresh metrics-set core bundled as a ``MetricsCoreSet`` tuple.

    The tuple carries the four values returned by
    ``_make_metrics_set_core`` under the field names ``metrics_sets``,
    ``register``, ``unregister`` and ``load``.
    """
    field_names = [
        "metrics_sets",
        "register",
        "unregister",
        "load",
    ]
    core_type = namedtuple("MetricsCoreSet", field_names)

    parts = _make_metrics_set_core()
    core = core_type(*parts)

    # a new core must not carry any registrations
    assert core.metrics_sets == mappingproxy({})

    yield core
def _make_bundle_core():
    """Create a family of data bundle functions that read from the same
    bundle mapping.

    All returned functions are closures over one private ``_bundles`` dict,
    so each call to this factory produces an independent registry.

    Returns
    -------
    bundles : mappingproxy
        The mapping of bundles to bundle payloads.
    register : callable
        The function which registers new bundles in the ``bundles`` mapping.
    unregister : callable
        The function which deregisters bundles from the ``bundles`` mapping.
    ingest : callable
        The function which downloads and writes data for a given data bundle.
    load : callable
        The function which loads the ingested bundles back into memory.
    clean : callable
        The function which cleans up data written with ``ingest``.
    """
    _bundles = {}  # the registered bundles
    # Expose _bundles through a proxy so that users cannot mutate this
    # accidentally. Users may go through `register` to update this which will
    # warn when trampling another bundle.
    bundles = mappingproxy(_bundles)

    @curry
    def register(name,
                 f,
                 calendar='NYSE',
                 start_session=None,
                 end_session=None,
                 minutes_per_day=390,
                 create_writers=True):
        """Register a data bundle ingest function.

        Parameters
        ----------
        name : str
            The name of the bundle.
        f : callable
            The ingest function. This function will be passed:

              environ : mapping
                  The environment this is being run with.
              asset_db_writer : AssetDBWriter
                  The asset db writer to write into.
              minute_bar_writer : BcolzMinuteBarWriter
                  The minute bar writer to write into.
              daily_bar_writer : BcolzDailyBarWriter
                  The daily bar writer to write into.
              adjustment_writer : SQLiteAdjustmentWriter
                  The adjustment db writer to write into.
              calendar : zipline.utils.calendars.TradingCalendar
                  The trading calendar to ingest for.
              start_session : pd.Timestamp
                  The first session of data to ingest.
              end_session : pd.Timestamp
                  The last session of data to ingest.
              cache : DataFrameCache
                  A mapping object to temporarily store dataframes.
                  This should be used to cache intermediates in case the load
                  fails. This will be automatically cleaned up after a
                  successful load.
              show_progress : bool
                  Show the progress for the current load where possible.

            NOTE(review): ``ingest`` below passes one more trailing
            positional argument,
            ``pth.data_path([name, timestr], environ=environ)``; presumably
            the output directory of the ingestion -- confirm and document.
        calendar : zipline.utils.calendars.TradingCalendar or str, optional
            The trading calendar to align the data to, or the name of a trading
            calendar. This defaults to 'NYSE', in which case we use the NYSE
            calendar.
        start_session : pd.Timestamp, optional
            The first session for which we want data. If not provided,
            or if the date lies outside the range supported by the
            calendar, the first_session of the calendar is used.
        end_session : pd.Timestamp, optional
            The last session for which we want data. If not provided,
            or if the date lies outside the range supported by the
            calendar, the last_session of the calendar is used.
        minutes_per_day : int, optional
            The number of minutes in each normal trading day.
        create_writers : bool, optional
            Should the ingest machinery create the writers for the ingest
            function. This can be disabled as an optimization for cases where
            they are not needed, like the ``quantopian-quandl`` bundle.

        Returns
        -------
        f : callable
            The ingest function that was passed in, unchanged.

        Notes
        -----
        This function may be used as a decorator, for example:

        .. code-block:: python

           @register('quandl')
           def quandl_ingest_function(...):
               ...

        Registering a name that is already present emits a warning and
        overwrites the previous payload.

        See Also
        --------
        zipline.data.bundles.bundles
        """
        if name in bundles:
            warnings.warn(
                'Overwriting bundle with name %r' % name,
                stacklevel=3,
            )

        # a calendar given by name is resolved to a calendar object
        if isinstance(calendar, str):
            calendar = get_calendar(calendar)

        # If the start and end sessions are not provided or lie outside
        # the bounds of the calendar being used, set them to the first
        # and last sessions of the calendar.
        if start_session is None or start_session < calendar.first_session:
            start_session = calendar.first_session
        if end_session is None or end_session > calendar.last_session:
            end_session = calendar.last_session

        _bundles[name] = _BundlePayload(
            calendar,
            start_session,
            end_session,
            minutes_per_day,
            f,
            create_writers,
        )
        return f

    def unregister(name):
        """Unregister a bundle.

        Parameters
        ----------
        name : str
            The name of the bundle to unregister.

        Raises
        ------
        UnknownBundle
            Raised when no bundle has been registered with the given name.

        See Also
        --------
        zipline.data.bundles.bundles
        """
        try:
            del _bundles[name]
        except KeyError:
            raise UnknownBundle(name)

    def ingest(name,
               environ=os.environ,
               timestamp=None,
               show_progress=False):
        """Ingest data for a given bundle.

        Parameters
        ----------
        name : str
            The name of the bundle.
        environ : mapping, optional
            The environment variables. By default this is os.environ.
        timestamp : datetime, optional
            The timestamp to use for the load.
            By default this is the current time.
        show_progress : bool, optional
            Tell the ingest function to display the progress where possible.

        Raises
        ------
        UnknownBundle
            Raised when no bundle has been registered with the given name.
        """
        try:
            bundle = bundles[name]
        except KeyError:
            raise UnknownBundle(name)

        if timestamp is None:
            timestamp = pd.Timestamp.utcnow()
        # NOTE(review): ``tz_convert`` presumably requires a tz-aware
        # timestamp -- confirm how callers pass naive values.
        timestamp = timestamp.tz_convert('utc').tz_localize(None)
        timestr = to_bundle_ingest_dirname(timestamp)
        cachepath = cache_path(name, environ=environ)
        # make sure both the output directory for this run and the cache
        # directory exist before anything is written
        pth.ensure_directory(pth.data_path([name, timestr], environ=environ))
        pth.ensure_directory(cachepath)
        with dataframe_cache(cachepath, clean_on_failure=False) as cache, \
                ExitStack() as stack:
            # we use `clean_on_failure=False` so that we don't purge the
            # cache directory if the load fails in the middle

            if bundle.create_writers:
                wd = stack.enter_context(
                    working_dir(pth.data_path([], environ=environ)))
                daily_bars_path = wd.ensure_dir(*daily_equity_relative(
                    name, timestr, environ=environ,
                ))
                daily_bar_writer = BcolzDailyBarWriter(
                    daily_bars_path,
                    bundle.calendar,
                    bundle.start_session,
                    bundle.end_session,
                )
                # Do an empty write to ensure that the daily ctables exist
                # when we create the SQLiteAdjustmentWriter below. The
                # SQLiteAdjustmentWriter needs to open the daily ctables so
                # that it can compute the adjustment ratios for the dividends.

                daily_bar_writer.write(())
                minute_bar_writer = BcolzMinuteBarWriter(
                    wd.ensure_dir(*minute_equity_relative(
                        name, timestr, environ=environ)),
                    bundle.calendar,
                    bundle.start_session,
                    bundle.end_session,
                    minutes_per_day=bundle.minutes_per_day,
                )
                asset_db_writer = AssetDBWriter(
                    wd.getpath(*asset_db_relative(
                        name, timestr, environ=environ,
                    )))

                # entered on the stack so it is exited together with the
                # working directory when the ``with`` block ends
                adjustment_db_writer = stack.enter_context(
                    SQLiteAdjustmentWriter(
                        wd.getpath(*adjustment_db_relative(
                            name, timestr, environ=environ)),
                        BcolzDailyBarReader(daily_bars_path),
                        bundle.calendar.all_sessions,
                        overwrite=True,
                    ))
            else:
                # the bundle opted out of writer creation; the ingest
                # function receives None for every writer
                daily_bar_writer = None
                minute_bar_writer = None
                asset_db_writer = None
                adjustment_db_writer = None

            bundle.ingest(
                environ,
                asset_db_writer,
                minute_bar_writer,
                daily_bar_writer,
                adjustment_db_writer,
                bundle.calendar,
                bundle.start_session,
                bundle.end_session,
                cache,
                show_progress,
                pth.data_path([name, timestr], environ=environ),
            )

    def most_recent_data(bundle_name, timestamp, environ=None):
        """Get the path to the most recent data for the given bundle.

        The newest non-hidden run directory (ordered by
        ``from_bundle_ingest_dirname``) is returned. In this function
        ``timestamp`` is only used to build the error message; it does not
        filter the candidates.

        Parameters
        ----------
        bundle_name : str
            The name of the bundle to lookup.
        timestamp : datetime
            The timestamp to begin searching on or before.
        environ : dict, optional
            An environment dict to forward to zipline_root.

        Raises
        ------
        UnknownBundle
            Raised when no bundle has been registered with the given name.
        ValueError
            Raised when the bundle directory is missing or holds no
            non-hidden runs.
        """
        if bundle_name not in bundles:
            raise UnknownBundle(bundle_name)

        try:
            candidates = os.listdir(
                pth.data_path([bundle_name], environ=environ),
            )
            return pth.data_path(
                [
                    bundle_name,
                    max(
                        filter(complement(pth.hidden), candidates),
                        key=from_bundle_ingest_dirname,
                    )
                ],
                environ=environ,
            )
        except (ValueError, OSError) as e:
            # An exception without ``errno`` (the ValueError case) is
            # treated like ENOENT; any other OS error is re-raised as is.
            if getattr(e, 'errno', errno.ENOENT) != errno.ENOENT:
                raise
            raise ValueError(
                'no data for bundle {bundle!r} on or before {timestamp}\n'
                'maybe you need to run: $ zipline ingest -b {bundle}'.format(
                    bundle=bundle_name,
                    timestamp=timestamp,
                ),
            )

    def load(name, environ=os.environ, timestamp=None):
        """Loads a previously ingested bundle.

        Parameters
        ----------
        name : str
            The name of the bundle.
        environ : mapping, optional
            The environment variables. Defaults to os.environ.
        timestamp : datetime, optional
            The timestamp of the data to lookup.
            Defaults to the current time.

        Returns
        -------
        bundle_data : BundleData
            The raw data readers for this bundle.
        """
        if timestamp is None:
            timestamp = pd.Timestamp.utcnow()
        timestr = most_recent_data(name, timestamp, environ=environ)
        return BundleData(
            asset_finder=AssetFinder(
                asset_db_path(name, timestr, environ=environ),
            ),
            equity_minute_bar_reader=BcolzMinuteBarReader(
                minute_equity_path(name, timestr, environ=environ),
            ),
            equity_daily_bar_reader=BcolzDailyBarReader(
                daily_equity_path(name, timestr, environ=environ),
            ),
            adjustment_reader=SQLiteAdjustmentReader(
                adjustment_db_path(name, timestr, environ=environ),
            ),
        )

    @preprocess(
        before=optionally(ensure_timestamp),
        after=optionally(ensure_timestamp),
    )
    def clean(name,
              before=None,
              after=None,
              keep_last=None,
              environ=os.environ):
        """Clean up data that was created with ``ingest`` or
        ``$ python -m zipline ingest``

        Parameters
        ----------
        name : str
            The name of the bundle to remove data for.
        before : datetime, optional
            Remove data ingested before this date.
            This argument is mutually exclusive with: keep_last
        after : datetime, optional
            Remove data ingested after this date.
            This argument is mutually exclusive with: keep_last
        keep_last : int, optional
            Remove all but the last ``keep_last`` ingestions.
            This argument is mutually exclusive with:
              before
              after
        environ : mapping, optional
            The environment variables. Defaults to os.environ.

        Returns
        -------
        cleaned : set[str]
            The paths of the run directories that were removed.

        Raises
        ------
        BadClean
            Raised when ``before`` and or ``after`` are passed with
            ``keep_last``, or when ``keep_last`` is negative. This is a
            subclass of ``ValueError``.
        UnknownBundle
            Raised when the bundle's data directory does not exist.
        """
        try:
            # non-hidden run directories, oldest first
            all_runs = sorted(
                filter(
                    complement(pth.hidden),
                    os.listdir(pth.data_path([name], environ=environ)),
                ),
                key=from_bundle_ingest_dirname,
            )
        except OSError as e:
            if e.errno != errno.ENOENT:
                raise
            raise UnknownBundle(name)

        if ((before is not None or after is not None) and
                keep_last is not None):
            raise BadClean(before, after, keep_last)

        if keep_last is None:
            def should_clean(name):
                dt = from_bundle_ingest_dirname(name)
                return (
                    (before is not None and dt < before) or
                    (after is not None and dt > after)
                )

        elif keep_last >= 0:
            # the newest ``keep_last`` runs survive; everything else goes
            last_n_dts = set(take(keep_last, reversed(all_runs)))

            def should_clean(name):
                return name not in last_n_dts
        else:
            raise BadClean(before, after, keep_last)

        cleaned = set()
        for run in all_runs:
            if should_clean(run):
                path = pth.data_path([name, run], environ=environ)
                shutil.rmtree(path)
                cleaned.add(path)

        return cleaned

    return BundleCore(bundles, register, unregister, ingest, load, clean)
class BcolzMinuteBarReader(MinuteBarReader):
    """
    Reader for data written by BcolzMinuteBarWriter

    Parameters
    ----------
    rootdir : string
        The root directory containing the metadata and asset bcolz
        directories.
    sid_cache_sizes : mapping, optional
        Maps each name in ``FIELDS`` to the size passed to the ``LRU`` cache
        of opened carrays for that field. Defaults to a read-only view of
        ``DEFAULT_MINUTELY_SID_CACHE_SIZES``.

    See Also
    --------
    zipline.data.minute_bars.BcolzMinuteBarWriter
    """
    FIELDS = ('open', 'high', 'low', 'close', 'volume')
    DEFAULT_MINUTELY_SID_CACHE_SIZES = {
        'close': 3000,
        'open': 1550,
        'high': 1550,
        'low': 1550,
        'volume': 1550,
    }
    assert set(FIELDS) == set(DEFAULT_MINUTELY_SID_CACHE_SIZES), \
        "FIELDS should match DEFAULT_MINUTELY_SID_CACHE_SIZES keys"

    # Wrap the defaults in proxy so that we don't accidentally mutate them in
    # place in the constructor. If a user wants to change the defaults, they
    # can do so by mutating DEFAULT_MINUTELY_SID_CACHE_SIZES.
    _default_proxy = mappingproxy(DEFAULT_MINUTELY_SID_CACHE_SIZES)

    def __init__(self, rootdir, sid_cache_sizes=_default_proxy):
        """Read the metadata under ``rootdir`` and set up the reader's
        calendar slice, OHLC scaling factors and per-field carray caches.
        """

        self._rootdir = rootdir

        metadata = self._get_metadata()

        self._start_session = metadata.start_session
        self._end_session = metadata.end_session

        self.calendar = metadata.calendar
        # restrict the calendar schedule to the sessions covered by the data
        slicer = self.calendar.schedule.index.slice_indexer(
            self._start_session,
            self._end_session,
        )
        self._schedule = self.calendar.schedule[slicer]
        self._market_opens = self._schedule.market_open
        # opens/closes as int64 counts of minutes (datetime64[m] values)
        self._market_open_values = self._market_opens.values.\
            astype('datetime64[m]').astype(np.int64)
        self._market_closes = self._schedule.market_close
        self._market_close_values = self._market_closes.values.\
            astype('datetime64[m]').astype(np.int64)

        # stored values are multiplied by these inverses when read back
        self._default_ohlc_inverse = 1.0 / metadata.default_ohlc_ratio
        ohlc_ratios = metadata.ohlc_ratios_per_sid
        if ohlc_ratios:
            self._ohlc_inverses_per_sid = (
                valmap(lambda x: 1.0 / x, ohlc_ratios))
        else:
            self._ohlc_inverses_per_sid = None

        self._minutes_per_day = metadata.minutes_per_day

        # one LRU cache of opened carrays per field, keyed by sid
        self._carrays = {
            field: LRU(sid_cache_sizes[field])
            for field in self.FIELDS
        }

        # single-entry memo of the last dt looked up in ``get_value``
        self._last_get_value_dt_position = None
        self._last_get_value_dt_value = None

        # This is to avoid any bad data or other performance-killing situation
        # where there is a consecutive streak of 0 (no volume) starting at an
        # asset's start date.
        # if asset 1 started on 2015-01-03 but its first trade is 2015-01-06
        # 10:31 AM US/Eastern, this dict would store {1: 23675971},
        # which is the minute epoch of that date.
        self._known_zero_volume_dict = {}

    def _get_metadata(self):
        """Read the ``BcolzMinuteBarMetadata`` stored under the root dir."""
        return BcolzMinuteBarMetadata.read(self._rootdir)

    @property
    def trading_calendar(self):
        """The calendar taken from the metadata (alias of ``calendar``)."""
        return self.calendar

    @lazyval
    def last_available_dt(self):
        """The market close of the end session, per the calendar."""
        _, close = self.calendar.open_and_close_for_session(self._end_session)
        return close

    @property
    def first_trading_day(self):
        """The start session recorded in the metadata."""
        return self._start_session

    def _ohlc_ratio_inverse_for_sid(self, sid):
        """Return the factor that OHLC values stored for ``sid`` are
        multiplied by when read.
        """
        if self._ohlc_inverses_per_sid is not None:
            try:
                return self._ohlc_inverses_per_sid[sid]
            except KeyError:
                pass

        # If we can not get a sid-specific OHLC inverse for this sid,
        # fallback to the default.
        return self._default_ohlc_inverse

    def _minutes_to_exclude(self):
        """
        Calculate the minutes which should be excluded when a window
        occurs on days which had an early close, i.e. days where the close
        based on the regular period of minutes per day and the market close
        do not match.

        Returns
        -------
        List of ``(market_open, early_close)`` pairs, one for each session
        whose open-to-close span differs from ``minutes_per_day - 1``
        minutes.
        """
        market_opens = self._market_opens.values.astype('datetime64[m]')
        market_closes = self._market_closes.values.astype('datetime64[m]')
        minutes_per_day = (market_closes - market_opens).astype(np.int64)
        early_indices = np.where(
            minutes_per_day != self._minutes_per_day - 1)[0]
        early_opens = self._market_opens[early_indices]
        early_closes = self._market_closes[early_indices]
        minutes = [(market_open, early_close)
                   for market_open, early_close
                   in zip(early_opens, early_closes)]
        return minutes

    @lazyval
    def _minute_exclusion_tree(self):
        """
        Build an interval tree keyed by the start and end of each range
        of positions that should be dropped from windows. (These are the
        minutes between an early close and the minute which would be the
        close based on the regular period if there were no early close.)
        The value of each node is the same start and end position stored as
        a tuple.

        The data is stored as such in support of a fast answer to the
        question, does a given start and end position overlap any of the
        exclusion spans?

        Returns
        -------
        IntervalTree containing nodes which represent the minutes to exclude
        because of early closes.
        """
        itree = IntervalTree()
        for market_open, early_close in self._minutes_to_exclude():
            start_pos = self._find_position_of_minute(early_close) + 1
            end_pos = (
                self._find_position_of_minute(market_open)
                +
                self._minutes_per_day
                -
                1
            )
            data = (start_pos, end_pos)
            # the key slice runs to end_pos + 1 while the stored data keeps
            # end_pos itself as an inclusive bound
            itree[start_pos:end_pos + 1] = data
        return itree

    def _exclusion_indices_for_range(self, start_idx, end_idx):
        """
        Returns
        -------
        Sorted list of tuples of (start, stop) which represent the ranges of
        minutes which should be excluded when a market minute window is
        requested, or None when the range overlaps no exclusion span.
        """
        itree = self._minute_exclusion_tree
        if itree.overlaps(start_idx, end_idx):
            ranges = []
            intervals = itree[start_idx:end_idx]
            for interval in intervals:
                ranges.append(interval.data)
            return sorted(ranges)
        else:
            return None

    def _get_carray_path(self, sid, field):
        """Return the directory of the carray holding ``field`` for ``sid``."""
        sid_subdir = _sid_subdir_path(sid)
        # carrays are subdirectories of the sid's rootdir
        return os.path.join(self._rootdir, sid_subdir, field)

    def _open_minute_file(self, field, sid):
        """Return the carray for ``field``/``sid``, opening it read-only and
        caching it on first use.

        Raises
        ------
        NoDataForSid
            Raised when opening the carray fails with ``IOError``.
        """
        sid = int(sid)

        try:
            carray = self._carrays[field][sid]
        except KeyError:
            try:
                carray = self._carrays[field][sid] = bcolz.carray(
                    rootdir=self._get_carray_path(sid, field),
                    mode='r',
                )
            except IOError:
                raise NoDataForSid('No minute data for sid {}.'.format(sid))

        return carray

    def table_len(self, sid):
        """Returns the length of the underlying table for this sid."""
        return len(self._open_minute_file('close', sid))

    def get_sid_attr(self, sid, name):
        """Return the bcolz attribute ``name`` stored for ``sid``, or None
        when the attribute is not present.
        """
        sid_subdir = _sid_subdir_path(sid)
        sid_path = os.path.join(self._rootdir, sid_subdir)
        attrs = bcolz.attrs.attrs(sid_path, 'r')
        try:
            return attrs[name]
        except KeyError:
            return None

    def get_value(self, sid, dt, field):
        """
        Retrieve the pricing info for the given sid, dt, and field.

        Parameters
        ----------
        sid : int
            Asset identifier.
        dt : datetime-like
            The datetime at which the trade occurred.
        field : string
            The type of pricing data to retrieve.
            ('open', 'high', 'low', 'close', 'volume')

        Returns
        -------
        out : float|int

        The market data for the given sid, dt, and field coordinates.

        For OHLC:
            Returns a float if a trade occurred at the given dt.
            If no trade occurred, a np.nan is returned.

        For volume:
            Returns the integer value of the volume.
            (A volume of 0 signifies no trades for the given dt.)

        Raises
        ------
        NoDataOnDate
            Raised when the position lookup for ``dt`` raises ValueError.
        """
        # reuse the position computed for the previous call when the same
        # dt is requested again
        if self._last_get_value_dt_value == dt.value:
            minute_pos = self._last_get_value_dt_position
        else:
            try:
                minute_pos = self._find_position_of_minute(dt)
            except ValueError:
                raise NoDataOnDate()

            self._last_get_value_dt_value = dt.value
            self._last_get_value_dt_position = minute_pos

        try:
            value = self._open_minute_file(field, sid)[minute_pos]
        except IndexError:
            # a position past the end of the carray reads as "no data"
            value = 0
        if value == 0:
            if field == 'volume':
                return 0
            else:
                return np.nan

        if field != 'volume':
            value *= self._ohlc_ratio_inverse_for_sid(sid)
        return value

    def get_last_traded_dt(self, asset, dt):
        """Return the minute of the last traded position found for ``asset``
        by ``_find_last_traded_position``, or ``pd.NaT`` when there is none.
        """
        minute_pos = self._find_last_traded_position(asset, dt)
        if minute_pos == -1:
            return pd.NaT
        return self._pos_to_minute(minute_pos)

    def _find_last_traded_position(self, asset, dt):
        """Return the position found by
        ``find_last_traded_position_internal`` for ``asset`` and ``dt``, or
        -1 when ``dt`` precedes the earliest minute worth searching.
        """
        volumes = self._open_minute_file('volume', asset)
        # NOTE(review): ``/`` is true division on Python 3 and yields
        # floats here -- confirm the helper below accepts that.
        start_date_minute = asset.start_date.value / NANOS_IN_MINUTE
        dt_minute = dt.value / NANOS_IN_MINUTE

        try:
            # if we know of a dt before which this asset has no volume,
            # don't look before that dt
            earliest_dt_to_search = self._known_zero_volume_dict[asset.sid]
        except KeyError:
            earliest_dt_to_search = start_date_minute

        if dt_minute < earliest_dt_to_search:
            return -1

        pos = find_last_traded_position_internal(
            self._market_open_values,
            self._market_close_values,
            dt_minute,
            earliest_dt_to_search,
            volumes,
            self._minutes_per_day,
        )

        if pos == -1:
            # if we didn't find any volume before this dt, save it to avoid
            # work in the future.
            try:
                self._known_zero_volume_dict[asset.sid] = max(
                    dt_minute,
                    self._known_zero_volume_dict[asset.sid]
                )
            except KeyError:
                self._known_zero_volume_dict[asset.sid] = dt_minute

        return pos

    def _pos_to_minute(self, pos):
        """Convert a minute position back into a UTC ``pd.Timestamp``."""
        minute_epoch = minute_value(
            self._market_open_values,
            pos,
            self._minutes_per_day
        )

        return pd.Timestamp(minute_epoch, tz='UTC', unit="m")

    def _find_position_of_minute(self, minute_dt):
        """
        Internal method that returns the position of the given minute in the
        list of every trading minute since market open of the first trading
        day. Adjusts non market minutes to the last close.

        ex. this method would return 1 for 2002-01-02 9:32 AM Eastern, if
        2002-01-02 is the first trading day of the dataset.

        Parameters
        ----------
        minute_dt: pd.Timestamp
            The minute whose position should be calculated.

        Returns
        -------
        int: The position of the given minute in the list of all trading
        minutes since market open on the first trading day.
        """
        return find_position_of_minute(
            self._market_open_values,
            self._market_close_values,
            minute_dt.value / NANOS_IN_MINUTE,
            self._minutes_per_day,
            False,
        )

    def load_raw_arrays(self, fields, start_dt, end_dt, sids):
        """
        Parameters
        ----------
        fields : list of str
           'open', 'high', 'low', 'close', or 'volume'
        start_dt: Timestamp
           Beginning of the window range.
        end_dt: Timestamp
           End of the window range.
        sids : list of int
           The asset identifiers in the window.

        Returns
        -------
        list of np.ndarray
            A list with an entry per field of ndarrays with shape
            (minutes in range, sids), containing the values for the
            respective field over start and end dt range. OHLC arrays are
            created with ``np.full(shape, np.nan)``; the volume array is
            created with dtype ``np.uint32`` and zeros.
        """
        start_idx = self._find_position_of_minute(start_dt)
        end_idx = self._find_position_of_minute(end_dt)

        num_minutes = (end_idx - start_idx + 1)

        results = []

        indices_to_exclude = self._exclusion_indices_for_range(
            start_idx, end_idx)
        if indices_to_exclude is not None:
            # excluded positions do not appear in the output rows
            for excl_start, excl_stop in indices_to_exclude:
                length = excl_stop - excl_start + 1
                num_minutes -= length

        shape = num_minutes, len(sids)

        for field in fields:
            if field != 'volume':
                out = np.full(shape, np.nan)
            else:
                out = np.zeros(shape, dtype=np.uint32)

            for i, sid in enumerate(sids):
                carray = self._open_minute_file(field, sid)
                values = carray[start_idx:end_idx + 1]
                if indices_to_exclude is not None:
                    # delete from the last range backwards so the offsets
                    # of the earlier ranges stay valid
                    for excl_start, excl_stop in indices_to_exclude[::-1]:
                        excl_slice = np.s_[
                            excl_start - start_idx:excl_stop - start_idx + 1]
                        values = np.delete(values, excl_slice)

                # stored zeros mean "no data"; only non-zero entries are
                # copied into the output
                where = values != 0
                # first slice down to len(where) because we might not have
                # written data for all the minutes requested
                if field != 'volume':
                    out[:len(where), i][where] = (
                        values[where] * self._ohlc_ratio_inverse_for_sid(sid))
                else:
                    out[:len(where), i][where] = values[where]

            results.append(out)
        return results
def _make_bundle_core():
    """Create a family of data bundle functions that read from the same
    bundle mapping.

    All returned functions are closures over one private ``_bundles`` dict,
    so each call to this factory produces an independent registry.

    Returns
    -------
    bundles : mappingproxy
        The mapping of bundles to bundle payloads.
    register : callable
        The function which registers new bundles in the ``bundles`` mapping.
    unregister : callable
        The function which deregisters bundles from the ``bundles`` mapping.
    ingest : callable
        The function which downloads and writes data for a given data bundle.
    load : callable
        The function which loads the ingested bundles back into memory.
    clean : callable
        The function which cleans up data written with ``ingest``.
    """
    _bundles = {}  # the registered bundles
    # Expose _bundles through a proxy so that users cannot mutate this
    # accidentally. Users may go through `register` to update this which will
    # warn when trampling another bundle.
    bundles = mappingproxy(_bundles)

    @curry
    def register(name,
                 f,
                 calendar=trading_days,
                 opens=open_and_closes['market_open'],
                 closes=open_and_closes['market_close'],
                 minutes_per_day=390,
                 create_writers=True):
        """Register a data bundle ingest function.

        Parameters
        ----------
        name : str
            The name of the bundle.
        f : callable
            The ingest function. This function will be passed:

              environ : mapping
                  The environment this is being run with.
              asset_db_writer : AssetDBWriter
                  The asset db writer to write into.
              minute_bar_writer : BcolzMinuteBarWriter
                  The minute bar writer to write into.
              daily_bar_writer : BcolzDailyBarWriter
                  The daily bar writer to write into.
              adjustment_writer : SQLiteAdjustmentWriter
                  The adjustment db writer to write into.
              calendar : pd.DatetimeIndex
                  The trading calendar to ingest for.
              cache : DataFrameCache
                  A mapping object to temporarily store dataframes.
                  This should be used to cache intermediates in case the load
                  fails. This will be automatically cleaned up after a
                  successful load.
              show_progress : bool
                  Show the progress for the current load where possible.

            NOTE(review): ``ingest`` below passes one more trailing
            positional argument,
            ``pth.data_path([name, timestr], environ=environ)``; presumably
            the output directory of the ingestion -- confirm and document.
        calendar : pd.DatetimeIndex, optional
            The exchange calendar to align the data to. This defaults to the
            NYSE calendar.
        opens : pd.DatetimeIndex, optional
            The minute when the market opens each day. This defaults to the
            NYSE calendar.
        closes : pd.DatetimeIndex, optional
            The minute when the market closes each day. This defaults to the
            NYSE calendar.
        minutes_per_day : int, optional
            The number of minutes in each normal trading day.
        create_writers : bool, optional
            Should the ingest machinery create the writers for the ingest
            function. This can be disabled as an optimization for cases where
            they are not needed, like the ``quantopian-quandl`` bundle.

        Returns
        -------
        f : callable
            The ingest function that was passed in, unchanged.

        Notes
        -----
        This function may be used as a decorator, for example:

        .. code-block:: python

           @register('quandl')
           def quandl_ingest_function(...):
               ...

        Registering a name that is already present emits a warning and
        overwrites the previous payload.

        See Also
        --------
        zipline.data.bundles.bundles
        """
        if name in bundles:
            warnings.warn(
                'Overwriting bundle with name %r' % name,
                stacklevel=3,
            )
        _bundles[name] = _BundlePayload(
            calendar,
            opens,
            closes,
            minutes_per_day,
            f,
            create_writers,
        )
        return f

    def unregister(name):
        """Unregister a bundle.

        Parameters
        ----------
        name : str
            The name of the bundle to unregister.

        Raises
        ------
        UnknownBundle
            Raised when no bundle has been registered with the given name.

        See Also
        --------
        zipline.data.bundles.bundles
        """
        try:
            del _bundles[name]
        except KeyError:
            raise UnknownBundle(name)

    def ingest(name,
               environ=os.environ,
               timestamp=None,
               show_progress=False):
        """Ingest data for a given bundle.

        Parameters
        ----------
        name : str
            The name of the bundle.
        environ : mapping, optional
            The environment variables. By default this is os.environ.
        timestamp : datetime, optional
            The timestamp to use for the load.
            By default this is the current time.
        show_progress : bool, optional
            Tell the ingest function to display the progress where possible.

        Raises
        ------
        UnknownBundle
            Raised when no bundle has been registered with the given name.
        """
        try:
            bundle = bundles[name]
        except KeyError:
            raise UnknownBundle(name)

        if timestamp is None:
            timestamp = pd.Timestamp.utcnow()
        timestamp = timestamp.tz_convert('utc').tz_localize(None)
        timestr = to_bundle_ingest_dirname(timestamp)
        cachepath = cache_path(name, environ=environ)
        # make sure both the output directory for this run and the cache
        # directory exist before anything is written
        pth.ensure_directory(pth.data_path([name, timestr], environ=environ))
        pth.ensure_directory(cachepath)
        with dataframe_cache(cachepath, clean_on_failure=False) as cache, \
                ExitStack() as stack:
            # we use `clean_on_failure=False` so that we don't purge the
            # cache directory if the load fails in the middle

            if bundle.create_writers:
                wd = stack.enter_context(working_dir(
                    pth.data_path([], environ=environ))
                )
                daily_bars_path = wd.ensure_dir(
                    *daily_equity_relative(
                        name, timestr, environ=environ,
                    )
                )
                daily_bar_writer = BcolzDailyBarWriter(
                    daily_bars_path,
                    nyse_cal,
                    bundle.calendar[0],
                    bundle.calendar[-1]
                )
                # Do an empty write to ensure that the daily ctables exist
                # when we create the SQLiteAdjustmentWriter below. The
                # SQLiteAdjustmentWriter needs to open the daily ctables so
                # that it can compute the adjustment ratios for the dividends.

                daily_bar_writer.write(())
                minute_bar_writer = BcolzMinuteBarWriter(
                    bundle.calendar[0],
                    wd.ensure_dir(*minute_equity_relative(
                        name, timestr, environ=environ)
                    ),
                    bundle.opens,
                    bundle.closes,
                    minutes_per_day=bundle.minutes_per_day,
                )
                asset_db_writer = AssetDBWriter(
                    wd.getpath(*asset_db_relative(
                        name, timestr, environ=environ,
                    ))
                )

                # entered on the stack so it is exited together with the
                # working directory when the ``with`` block ends
                adjustment_db_writer = stack.enter_context(
                    SQLiteAdjustmentWriter(
                        wd.getpath(*adjustment_db_relative(
                            name, timestr, environ=environ)),
                        BcolzDailyBarReader(daily_bars_path),
                        bundle.calendar,
                        overwrite=True,
                    )
                )
            else:
                # the bundle opted out of writer creation; the ingest
                # function receives None for every writer
                daily_bar_writer = None
                minute_bar_writer = None
                asset_db_writer = None
                adjustment_db_writer = None

            bundle.ingest(
                environ,
                asset_db_writer,
                minute_bar_writer,
                daily_bar_writer,
                adjustment_db_writer,
                bundle.calendar,
                cache,
                show_progress,
                pth.data_path([name, timestr], environ=environ),
            )

    def most_recent_data(bundle_name, timestamp, environ=None):
        """Get the path to the most recent data for the given bundle.

        The newest non-hidden run directory (ordered by
        ``from_bundle_ingest_dirname``) is returned. In this function
        ``timestamp`` is only used to build the error message; it does not
        filter the candidates.

        Parameters
        ----------
        bundle_name : str
            The name of the bundle to lookup.
        timestamp : datetime
            The timestamp to begin searching on or before.
        environ : dict, optional
            An environment dict to forward to zipline_root.

        Raises
        ------
        UnknownBundle
            Raised when no bundle has been registered with the given name.
        ValueError
            Raised when the bundle directory is missing or holds no
            non-hidden runs.
        """
        if bundle_name not in bundles:
            raise UnknownBundle(bundle_name)

        try:
            candidates = os.listdir(
                pth.data_path([bundle_name], environ=environ),
            )
            return pth.data_path(
                [bundle_name,
                 max(
                     filter(complement(pth.hidden), candidates),
                     key=from_bundle_ingest_dirname,
                 )],
                environ=environ,
            )
        except (ValueError, OSError) as e:
            # An exception without ``errno`` (the ValueError case) is
            # treated like ENOENT; any other OS error is re-raised as is.
            if getattr(e, 'errno', errno.ENOENT) != errno.ENOENT:
                raise
            raise ValueError(
                'no data for bundle {bundle!r} on or before {timestamp}\n'
                'maybe you need to run: $ zipline ingest -b {bundle}'.format(
                    bundle=bundle_name,
                    timestamp=timestamp,
                ),
            )

    def load(name, environ=os.environ, timestamp=None):
        """Loads a previously ingested bundle.

        Parameters
        ----------
        name : str
            The name of the bundle.
        environ : mapping, optional
            The environment variables. Defaults to os.environ.
        timestamp : datetime, optional
            The timestamp of the data to lookup.
            Defaults to the current time.

        Returns
        -------
        bundle_data : BundleData
            The raw data readers for this bundle.
        """
        if timestamp is None:
            timestamp = pd.Timestamp.utcnow()
        timestr = most_recent_data(name, timestamp, environ=environ)
        return BundleData(
            asset_finder=AssetFinder(
                asset_db_path(name, timestr, environ=environ),
            ),
            equity_minute_bar_reader=BcolzMinuteBarReader(
                minute_equity_path(name, timestr, environ=environ),
            ),
            equity_daily_bar_reader=BcolzDailyBarReader(
                daily_equity_path(name, timestr, environ=environ),
            ),
            adjustment_reader=SQLiteAdjustmentReader(
                adjustment_db_path(name, timestr, environ=environ),
            ),
        )

    @preprocess(
        before=optionally(ensure_timestamp),
        after=optionally(ensure_timestamp),
    )
    def clean(name,
              before=None,
              after=None,
              keep_last=None,
              environ=os.environ):
        """Clean up data that was created with ``ingest`` or
        ``$ python -m zipline ingest``

        Parameters
        ----------
        name : str
            The name of the bundle to remove data for.
        before : datetime, optional
            Remove data ingested before this date.
            This argument is mutually exclusive with: keep_last
        after : datetime, optional
            Remove data ingested after this date.
            This argument is mutually exclusive with: keep_last
        keep_last : int, optional
            Remove all but the last ``keep_last`` ingestions; ``0`` removes
            every ingestion.
            This argument is mutually exclusive with:
              before
              after
        environ : mapping, optional
            The environment variables. Defaults to os.environ.

        Returns
        -------
        cleaned : set[str]
            The paths of the run directories that were removed.

        Raises
        ------
        BadClean
            Raised when ``before`` and or ``after`` are passed with
            ``keep_last``, or when ``keep_last`` is negative. This is a
            subclass of ``ValueError``.
        UnknownBundle
            Raised when the bundle's data directory does not exist.
        """
        try:
            # non-hidden run directories, oldest first
            all_runs = sorted(
                filter(
                    complement(pth.hidden),
                    os.listdir(pth.data_path([name], environ=environ)),
                ),
                key=from_bundle_ingest_dirname,
            )
        except OSError as e:
            if e.errno != errno.ENOENT:
                raise
            raise UnknownBundle(name)

        if ((before is not None or after is not None) and
                keep_last is not None):
            raise BadClean(before, after, keep_last)

        if keep_last is None:
            def should_clean(name):
                dt = from_bundle_ingest_dirname(name)
                return (
                    (before is not None and dt < before) or
                    (after is not None and dt > after)
                )

        elif keep_last >= 0:
            # ``all_runs[-0:]`` is the *whole* list, so a negative-index
            # slice would keep everything for keep_last == 0. Slice from an
            # explicit, clamped start index instead.
            first_kept = max(len(all_runs) - keep_last, 0)
            last_n_dts = set(all_runs[first_kept:])

            def should_clean(name):
                return name not in last_n_dts

        else:
            # a negative count has no sensible meaning
            raise BadClean(before, after, keep_last)

        cleaned = set()
        for run in all_runs:
            if should_clean(run):
                path = pth.data_path([name, run], environ=environ)
                shutil.rmtree(path)
                cleaned.add(path)

        return cleaned

    return BundleCore(bundles, register, unregister, ingest, load, clean)
def _make_metrics_set_core(): """Create a family of metrics sets functions that read from the same metrics set mapping. Returns ------- metrics_sets : mappingproxy The mapping of metrics sets to load functions. register : callable The function which registers new metrics sets in the ``metrics_sets`` mapping. unregister : callable The function which deregisters metrics sets from the ``metrics_sets`` mapping. load : callable The function which loads the ingested metrics sets back into memory. """ _metrics_sets = {} # Expose _metrics_sets through a proxy so that users cannot mutate this # accidentally. Users may go through `register` to update this which will # warn when trampling another metrics set. metrics_sets = mappingproxy(_metrics_sets) def register(name, function=None): """Register a new metrics set. Parameters ---------- name : str The name of the metrics set function : callable The callable which produces the metrics set. Notes ----- This may be used as a decorator if only ``name`` is passed. See Also -------- zipline.finance.metrics.get_metrics_set zipline.finance.metrics.unregister_metrics_set """ if function is None: # allow as decorator with just name. return partial(register, name) if name in _metrics_sets: raise ValueError("metrics set %r is already registered" % name) _metrics_sets[name] = function return function def unregister(name): """Unregister an existing metrics set. Parameters ---------- name : str The name of the metrics set See Also -------- zipline.finance.metrics.register_metrics_set """ try: del _metrics_sets[name] except KeyError: raise ValueError( "metrics set %r was not already registered" % name, ) def load(name): """Return an instance of the metrics set registered with the given name. Returns ------- metrics : set[Metric] A new instance of the metrics set. 
Raises ------ ValueError Raised when no metrics set is registered to ``name`` """ try: function = _metrics_sets[name] except KeyError: raise ValueError( "no metrics set registered as %r, options are: %r" % ( name, sorted(_metrics_sets), ), ) return function() return metrics_sets, register, unregister, load
def _make_bundle_core():
    """Create a family of data bundle functions that read from the same
    bundle mapping.

    Returns
    -------
    bundles : mappingproxy
        The mapping of bundles to bundle payloads.
    register : callable
        The function which registers new bundles in the ``bundles`` mapping.
    unregister : callable
        The function which deregisters bundles from the ``bundles`` mapping.
    ingest : callable
        The function which downloads and writes data for a given data bundle.
    load : callable
        The function which loads the ingested bundles back into memory.
    """
    _bundles = {}  # the registered bundles
    # Expose _bundles through a proxy so that users cannot mutate this
    # accidentally. Users may go through `register` to update this which will
    # warn when trampling another bundle.
    bundles = mappingproxy(_bundles)

    @curry
    def register(name,
                 f,
                 calendar_name='NYSE',
                 start_session=None,
                 end_session=None,
                 minutes_per_day=390,
                 create_writers=True):
        """Register a data bundle ingest function.

        Parameters
        ----------
        name : str
            The name of the bundle.
        f : callable
            The ingest function. This function will be passed, positionally
            and in this order:

              environ : mapping
                  The environment this is being run with.
              asset_db_writer : AssetDBWriter
                  The asset db writer to write into.
              minute_bar_writer : BcolzMinuteBarWriter
                  The minute bar writer to write into.
              daily_bar_writer : BcolzDailyBarWriter
                  The daily bar writer to write into.
              adjustment_writer : SQLiteAdjustmentWriter
                  The adjustment db writer to write into.
              calendar : trading_calendars.TradingCalendar
                  The trading calendar to ingest for.
              start_session : pd.Timestamp
                  The first session of data to ingest.
              end_session : pd.Timestamp
                  The last session of data to ingest.
              cache : DataFrameCache
                  A mapping object to temporarily store dataframes.
                  This should be used to cache intermediates in case the load
                  fails. This will be automatically cleaned up after a
                  successful load.
              output_dir : path
                  The data directory of this ingestion, i.e.
                  ``pth.data_path([name, timestr], environ=environ)``.

            The four writers are all ``None`` when ``create_writers`` is
            False.
        calendar_name : str, optional
            The name of a calendar used to align bundle data.
            Default is 'NYSE'.
        start_session : pd.Timestamp, optional
            The first session for which we want data. If not provided,
            or if the date lies outside the range supported by the
            calendar, the first_session of the calendar is used.
        end_session : pd.Timestamp, optional
            The last session for which we want data. If not provided,
            or if the date lies outside the range supported by the
            calendar, the last_session of the calendar is used.
        minutes_per_day : int, optional
            The number of minutes in each normal trading day.
        create_writers : bool, optional
            Should the ingest machinery create the writers for the ingest
            function. This can be disabled as an optimization for cases where
            they are not needed.

        Returns
        -------
        f : callable
            The ingest function, unchanged, so this works as a decorator.

        Notes
        -----
        This function may be used as a decorator, for example:

        .. code-block:: python

           @register('quandl')
           def quandl_ingest_function(...):
               ...

        Registering a name that already exists emits a warning and replaces
        the previous entry.

        See Also
        --------
        zipline.data.bundles.bundles
        """
        if name in bundles:
            warnings.warn(
                'Overwriting bundle with name %r' % name,
                stacklevel=3,
            )

        # NOTE: We don't eagerly compute calendar values here because
        # `register` is called at module scope in zipline, and creating a
        # calendar currently takes between 0.5 and 1 seconds, which causes a
        # noticeable delay on the zipline CLI.
        _bundles[name] = RegisteredBundle(
            calendar_name=calendar_name,
            start_session=start_session,
            end_session=end_session,
            minutes_per_day=minutes_per_day,
            ingest=f,
            create_writers=create_writers,
        )
        return f

    def unregister(name):
        """Unregister a bundle.

        Parameters
        ----------
        name : str
            The name of the bundle to unregister.

        Raises
        ------
        UnknownBundle
            Raised when no bundle has been registered with the given name.

        See Also
        --------
        zipline.data.bundles.bundles
        """
        try:
            del _bundles[name]
        except KeyError:
            raise UnknownBundle(name)

    def ingest(name, environ=os.environ, timestamp=None):
        """Ingest data for a given bundle.

        Parameters
        ----------
        name : str
            The name of the bundle.
        environ : mapping, optional
            The environment variables. By default this is os.environ.
        timestamp : datetime, optional
            The timestamp to use for the load.
            By default this is the current time. The value is converted with
            ``tz_convert('utc')``, so it should be a timezone-aware
            ``pd.Timestamp``.

        Raises
        ------
        UnknownBundle
            Raised when no bundle has been registered with the given name.
        """
        try:
            bundle = bundles[name]
        except KeyError:
            raise UnknownBundle(name)

        calendar = get_calendar(bundle.calendar_name)

        start_session = bundle.start_session
        end_session = bundle.end_session

        # Clamp the requested session range to what the calendar supports.
        if start_session is None or start_session < calendar.first_session:
            start_session = calendar.first_session

        if end_session is None or end_session > calendar.last_session:
            end_session = calendar.last_session

        if timestamp is None:
            timestamp = pd.Timestamp.utcnow()
        # Normalize to a naive UTC timestamp before building the dirname.
        timestamp = timestamp.tz_convert('utc').tz_localize(None)

        timestr = to_bundle_ingest_dirname(timestamp)
        cachepath = cache_path(name, environ=environ)
        pth.ensure_directory(pth.data_path([name, timestr], environ=environ))
        pth.ensure_directory(cachepath)
        with dataframe_cache(cachepath, clean_on_failure=False) as cache, \
                ExitStack() as stack:
            # we use `clean_on_failure=False` so that we don't purge the
            # cache directory if the load fails in the middle
            if bundle.create_writers:
                wd = stack.enter_context(
                    working_dir(pth.data_path([], environ=environ)))
                daily_bars_path = wd.ensure_dir(
                    *daily_equity_relative(name, timestr))
                daily_bar_writer = BcolzDailyBarWriter(
                    daily_bars_path,
                    calendar,
                    start_session,
                    end_session,
                )
                # Do an empty write to ensure that the daily ctables exist
                # when we create the SQLiteAdjustmentWriter below. The
                # SQLiteAdjustmentWriter needs to open the daily ctables so
                # that it can compute the adjustment ratios for the dividends.
                daily_bar_writer.write(())
                minute_bar_writer = BcolzMinuteBarWriter(
                    wd.ensure_dir(*minute_equity_relative(name, timestr)),
                    calendar,
                    start_session,
                    end_session,
                    minutes_per_day=bundle.minutes_per_day,
                )
                assets_db_path = wd.getpath(*asset_db_relative(name, timestr))
                asset_db_writer = AssetDBWriter(assets_db_path)

                adjustment_db_writer = stack.enter_context(
                    SQLiteAdjustmentWriter(
                        wd.getpath(*adjustment_db_relative(name, timestr)),
                        BcolzDailyBarReader(daily_bars_path),
                        overwrite=True,
                    ))
            else:
                daily_bar_writer = None
                minute_bar_writer = None
                asset_db_writer = None
                adjustment_db_writer = None

            bundle.ingest(
                environ,
                asset_db_writer,
                minute_bar_writer,
                daily_bar_writer,
                adjustment_db_writer,
                calendar,
                start_session,
                end_session,
                cache,
                pth.data_path([name, timestr], environ=environ),
            )

    def most_recent_data(bundle_name, timestamp, environ=None):
        """Get the path to the most recent ingestion for the given bundle.

        Parameters
        ----------
        bundle_name : str
            The name of the bundle to lookup.
        timestamp : datetime
            The timestamp to begin searching on or before.
            NOTE: this value is currently only used in the error message;
            the newest non-hidden ingestion directory is always selected.
        environ : dict, optional
            An environment dict to forward to zipline_root.

        Returns
        -------
        path
            ``pth.data_path`` of the newest ingestion directory.

        Raises
        ------
        UnknownBundle
            Raised when ``bundle_name`` is not registered.
        ValueError
            Raised when the bundle has no ingested data.
        """
        if bundle_name not in bundles:
            raise UnknownBundle(bundle_name)

        try:
            candidates = os.listdir(
                pth.data_path([bundle_name], environ=environ),
            )
            return pth.data_path(
                [
                    bundle_name,
                    max(
                        filter(complement(pth.hidden), candidates),
                        key=from_bundle_ingest_dirname,
                    )
                ],
                environ=environ,
            )
        except (ValueError, OSError) as e:
            # ``max`` of an empty sequence raises ValueError (no ``errno``
            # attribute, so the default applies); a missing directory raises
            # OSError with ENOENT. Anything else is re-raised untouched.
            if getattr(e, 'errno', errno.ENOENT) != errno.ENOENT:
                raise
            raise ValueError(
                'no data for bundle {bundle!r} on or before {timestamp}\n'
                'maybe you need to run: $ zipline ingest -b {bundle}'.format(
                    bundle=bundle_name,
                    timestamp=timestamp,
                ),
            )

    def load(name,
             environ=os.environ,
             timestamp=None,
             daily_bar_reader_kwargs=None,
             minute_bar_reader_kwargs=None):
        """Loads a previously ingested bundle.

        Parameters
        ----------
        name : str
            The name of the bundle.
        environ : mapping, optional
            The environment variables. Defaults to os.environ.
        timestamp : datetime, optional
            The timestamp of the data to lookup.
            Defaults to the current time.
        daily_bar_reader_kwargs : dict, optional
            Extra keyword arguments forwarded to ``BcolzDailyBarReader``.
            Defaults to no extra arguments.
        minute_bar_reader_kwargs : dict, optional
            Extra keyword arguments forwarded to ``BcolzMinuteBarReader``.
            Defaults to no extra arguments.

        Returns
        -------
        bundle_data : BundleData
            The raw data readers for this bundle.
        """
        if timestamp is None:
            timestamp = pd.Timestamp.utcnow()
        # ``None`` sentinels avoid sharing one mutable default dict across
        # calls.
        if daily_bar_reader_kwargs is None:
            daily_bar_reader_kwargs = {}
        if minute_bar_reader_kwargs is None:
            minute_bar_reader_kwargs = {}

        timestr = most_recent_data(name, timestamp, environ=environ)
        return BundleData(
            asset_finder=AssetFinder(
                asset_db_path(name, timestr, environ=environ),
            ),
            equity_minute_bar_reader=BcolzMinuteBarReader(
                minute_equity_path(name, timestr, environ=environ),
                **minute_bar_reader_kwargs),
            equity_daily_bar_reader=BcolzDailyBarReader(
                daily_equity_path(name, timestr, environ=environ),
                **daily_bar_reader_kwargs),
            adjustment_reader=SQLiteAdjustmentReader(
                adjustment_db_path(name, timestr, environ=environ),
            ),
        )

    return BundleCore(bundles, register, unregister, ingest, load)