class DataFrameLoader(PipelineLoader): """ A PipelineLoader that reads its input from DataFrames. Mostly useful for testing, but can also be used for real work if your data fits in memory. Parameters ---------- column : zipline.pipeline.data.BoundColumn The column whose data is loadable by this loader. baseline : pandas.DataFrame A DataFrame with index of type DatetimeIndex and columns of type Int64Index. Dates should be labelled with the first date on which a value would be **available** to an algorithm. This means that OHLCV data should generally be shifted back by a trading day before being supplied to this class. adjustments : pandas.DataFrame, default=None A DataFrame with the following columns: sid : int value : any kind : int (zipline.pipeline.loaders.frame.ADJUSTMENT_TYPES) start_date : datetime64 (can be NaT) end_date : datetime64 (must be set) apply_date : datetime64 (must be set) The default of None is interpreted as "no adjustments to the baseline". """ def __init__(self, column, baseline, adjustments=None): self.column = column self.baseline = baseline.values.astype(self.column.dtype) self.dates = baseline.index self.assets = baseline.columns if adjustments is None: adjustments = DataFrame( index=DatetimeIndex([]), columns=ADJUSTMENT_COLUMNS, ) else: # Ensure that columns are in the correct order. adjustments = adjustments.reindex_axis(ADJUSTMENT_COLUMNS, axis=1) adjustments.sort_values(['apply_date', 'sid'], inplace=True) self.adjustments = adjustments self.adjustment_apply_dates = DatetimeIndex(adjustments.apply_date) self.adjustment_end_dates = DatetimeIndex(adjustments.end_date) self.adjustment_sids = Int64Index(adjustments.sid) def format_adjustments(self, dates, assets): """ Build a dict of Adjustment objects in the format expected by AdjustedArray. Returns a dict of the form: { # Integer index into `dates` for the date on which we should # apply the list of adjustments. 1 : [ Float64Multiply(first_row=2, last_row=4, col=3, value=0.5), Float64Overwrite(first_row=3, last_row=5, col=1, value=2.0), ... ], ... } """ make_adjustment = partial(make_adjustment_from_labels, dates, assets) min_date, max_date = dates[[0, -1]] # TODO: Consider porting this to Cython. if len(self.adjustments) == 0: return {} # Mask for adjustments whose apply_dates are in the requested window of # dates. date_bounds = self.adjustment_apply_dates.slice_indexer( min_date, max_date, ) dates_filter = zeros(len(self.adjustments), dtype='bool') dates_filter[date_bounds] = True # Ignore adjustments whose apply_date is in range, but whose end_date # is out of range. dates_filter &= (self.adjustment_end_dates >= min_date) # Mask for adjustments whose sids are in the requested assets. sids_filter = self.adjustment_sids.isin(assets.values) adjustments_to_use = self.adjustments.loc[ dates_filter & sids_filter ].set_index('apply_date') # For each apply_date on which we have an adjustment, compute # the integer index of that adjustment's apply_date in `dates`. # Then build a list of Adjustment objects for that apply_date. # This logic relies on the sorting applied on the previous line. out = {} previous_apply_date = object() for row in adjustments_to_use.itertuples(): # This expansion depends on the ordering of the DataFrame columns, # defined above. apply_date, sid, value, kind, start_date, end_date = row if apply_date != previous_apply_date: # Get the next apply date if no exact match. row_loc = dates.get_loc(apply_date, method='bfill') current_date_adjustments = out[row_loc] = [] previous_apply_date = apply_date # Look up the approprate Adjustment constructor based on the value # of `kind`. current_date_adjustments.append( make_adjustment(start_date, end_date, sid, kind, value) ) return out def load_adjusted_array(self, columns, dates, assets, mask): """ Load data from our stored baseline. """ column = self.column if len(columns) != 1: raise ValueError( "Can't load multiple columns with DataFrameLoader" ) elif columns[0] != column: raise ValueError("Can't load unknown column %s" % columns[0]) date_indexer = self.dates.get_indexer(dates) assets_indexer = self.assets.get_indexer(assets) # Boolean arrays with True on matched entries good_dates = (date_indexer != -1) good_assets = (assets_indexer != -1) return { column: AdjustedArray( # Pull out requested columns/rows from our baseline data. data=self.baseline[ix_(date_indexer, assets_indexer)], # Mask out requested columns/rows that didnt match. mask=(good_assets & as_column(good_dates)) & mask, adjustments=self.format_adjustments(dates, assets), missing_value=column.missing_value, ), }
class DataFrameLoader(PipelineLoader): """ A PipelineLoader that reads its input from DataFrames. Mostly useful for testing, but can also be used for real work if your data fits in memory. Parameters ---------- column : catalyst.pipeline.data.BoundColumn The column whose data is loadable by this loader. baseline : pandas.DataFrame A DataFrame with index of type DatetimeIndex and columns of type Int64Index. Dates should be labelled with the first date on which a value would be **available** to an algorithm. This means that OHLCV data should generally be shifted back by a trading day before being supplied to this class. adjustments : pandas.DataFrame, default=None A DataFrame with the following columns: sid : int value : any kind : int (catalyst.pipeline.loaders.frame.ADJUSTMENT_TYPES) start_date : datetime64 (can be NaT) end_date : datetime64 (must be set) apply_date : datetime64 (must be set) The default of None is interpreted as "no adjustments to the baseline". """ def __init__(self, column, baseline, adjustments=None): self.column = column self.baseline = baseline.values.astype(self.column.dtype) self.dates = baseline.index self.assets = baseline.columns self._columns = self.assets if adjustments is None: adjustments = DataFrame( index=DatetimeIndex([]), columns=ADJUSTMENT_COLUMNS, ) else: # Ensure that columns are in the correct order. adjustments = adjustments.reindex_axis(ADJUSTMENT_COLUMNS, axis=1) adjustments.sort_values(['apply_date', 'sid'], inplace=True) self.adjustments = adjustments self.adjustment_apply_dates = DatetimeIndex(adjustments.apply_date) self.adjustment_end_dates = DatetimeIndex(adjustments.end_date) self.adjustment_sids = Int64Index(adjustments.sid) def format_adjustments(self, dates, assets): """ Build a dict of Adjustment objects in the format expected by AdjustedArray. Returns a dict of the form: { # Integer index into `dates` for the date on which we should # apply the list of adjustments. 1 : [ Float64Multiply(first_row=2, last_row=4, col=3, value=0.5), Float64Overwrite(first_row=3, last_row=5, col=1, value=2.0), ... ], ... } """ make_adjustment = partial(make_adjustment_from_labels, dates, assets) min_date, max_date = dates[[0, -1]] # TODO: Consider porting this to Cython. if len(self.adjustments) == 0: return {} # Mask for adjustments whose apply_dates are in the requested window of # dates. date_bounds = self.adjustment_apply_dates.slice_indexer( min_date, max_date, ) dates_filter = zeros(len(self.adjustments), dtype='bool') dates_filter[date_bounds] = True # Ignore adjustments whose apply_date is in range, but whose end_date # is out of range. dates_filter &= (self.adjustment_end_dates >= min_date) # Mask for adjustments whose sids are in the requested assets. sids_filter = self.adjustment_sids.isin(assets.values) adjustments_to_use = self.adjustments.loc[ dates_filter & sids_filter].set_index('apply_date') # For each apply_date on which we have an adjustment, compute # the integer index of that adjustment's apply_date in `dates`. # Then build a list of Adjustment objects for that apply_date. # This logic relies on the sorting applied on the previous line. out = {} previous_apply_date = object() for row in adjustments_to_use.itertuples(): # This expansion depends on the ordering of the DataFrame columns, # defined above. apply_date, sid, value, kind, start_date, end_date = row if apply_date != previous_apply_date: # Get the next apply date if no exact match. row_loc = dates.get_loc(apply_date, method='bfill') current_date_adjustments = out[row_loc] = [] previous_apply_date = apply_date # Look up the approprate Adjustment constructor based on the value # of `kind`. current_date_adjustments.append( make_adjustment(start_date, end_date, sid, kind, value)) return out def load_adjusted_array(self, columns, dates, assets, mask): """ Load data from our stored baseline. """ column = self.column if len(columns) != 1: raise ValueError( "Can't load multiple columns with DataFrameLoader") elif columns[0] != column: raise ValueError("Can't load unknown column %s" % columns[0]) date_indexer = self.dates.get_indexer(dates) assets_indexer = self.assets.get_indexer(assets) # Boolean arrays with True on matched entries good_dates = (date_indexer != -1) good_assets = (assets_indexer != -1) return { column: AdjustedArray( # Pull out requested columns/rows from our baseline data. data=self.baseline[ix_(date_indexer, assets_indexer)], # Mask out requested columns/rows that didnt match. mask=(good_assets & as_column(good_dates)) & mask, adjustments=self.format_adjustments(dates, assets), missing_value=column.missing_value, ), } @property def columns(self): return self._columns