def test_bad_input(self):
    """Requesting a wrong column, or more than one column, raises ValueError.

    The loader is built for ``USEquityPricing.close`` only; each request
    below asks for something else and is expected to be rejected.
    """
    values = arange(100).reshape(self.ndates, self.nsids)
    frame = DataFrame(values, index=self.dates, columns=self.sids)
    loader = DataFrameLoader(USEquityPricing.close, frame)

    invalid_requests = (
        # Wrong column.
        [USEquityPricing.open],
        # Too many columns.
        [USEquityPricing.open, USEquityPricing.close],
    )
    for requested_columns in invalid_requests:
        with self.assertRaises(ValueError):
            loader.load_adjusted_array(
                US_EQUITIES,
                requested_columns,
                self.dates,
                self.sids,
                self.mask,
            )
def _load_events(self, name_map, indexer, columns, dates, sids, mask):
    """Load one adjusted array per requested column from ``self.events``.

    Parameters
    ----------
    name_map
        Mapping from each requested column to its key in ``self.events``.
    indexer
        2D integer array of shape ``(len(dates), len(sids))`` giving, for
        each date/sid cell, the position of the event to use. Negative
        entries mark cells with no known value.
    columns
        Columns to load; each must expose ``missing_value`` and ``dtype``.
    dates, sids, mask
        Forwarded to ``DataFrameLoader.load_adjusted_array``.

    Returns
    -------
    dict
        Maps each requested column to the array loaded for it.
    """
    expected_shape = (len(dates), len(sids))
    assert indexer.shape == expected_shape

    results = {}
    for column in columns:
        # Every event's value for this column.
        event_values = self.events[name_map[column]]

        if len(event_values):
            # Fancy-index the event values into a date x sid grid shaped
            # like `indexer`.
            grid = event_values[indexer]
            # Cells with a negative index have no known event; fill them
            # with the column's missing value.
            grid[indexer < 0] = column.missing_value
        else:
            # No events at all: indexing an empty array with -1 would
            # fail, so build a grid of missing values directly.
            grid = np.full(
                expected_shape,
                column.missing_value,
                dtype=column.dtype,
            )

        # DataFrameLoader owns the actual array formatting logic.
        frame = pd.DataFrame(grid, index=dates, columns=sids)
        frame_loader = DataFrameLoader(column, frame, adjustments=None)
        loaded = frame_loader.load_adjusted_array([column], dates, sids, mask)
        results[column] = loaded[column]
    return results
def load_adjusted_array(self, domain, columns, dates, sids, mask):
    """Load each requested column via ``self.fundamentals_reader``.

    For every column, the reader is asked for ``column.name`` over
    ``dates`` and ``sids``; the result is wrapped in a ``DataFrameLoader``
    which produces the output for that single column.

    Returns
    -------
    dict
        The merged per-column results of the individual loaders.
    """
    results = {}
    for col in columns:
        frame = self.fundamentals_reader.read(col.name, dates, sids)
        column_loader = DataFrameLoader(col, frame)
        loaded = column_loader.load_adjusted_array(
            domain, [col], dates, sids, mask,
        )
        results.update(loaded)
    return results
def _load_events(self, name_map, indexer, columns, dates, sids, mask):
    """Load one adjusted array per requested column from ``self.events``.

    Parameters
    ----------
    name_map
        Mapping from each requested column to its key in ``self.events``.
    indexer
        Integer array laid out as dates x sids giving the position of the
        event to use for each cell; -1 marks cells with no known value.
    columns
        Columns to load; each must expose ``missing_value`` and ``dtype``.
    dates, sids, mask
        Forwarded to ``DataFrameLoader.load_adjusted_array``.

    Returns
    -------
    dict
        Maps each requested column to the array loaded for it.
    """
    def to_frame(array):
        return pd.DataFrame(array, index=dates, columns=sids)

    out = {}
    for c in columns:
        # Array holding the value for column `c` for every event we have.
        col_array = self.events[name_map[c]]

        if not len(col_array):
            # There are no events at all. The normal branch relies on
            # being able to index with -1 for missing values, which fails
            # on an empty array, so return c.missing_value everywhere.
            raw = np.full(
                (len(dates), len(sids)),
                c.missing_value,
                dtype=c.dtype,
            )
        else:
            raw = col_array[indexer]
            # indexer will be -1 for locations where we don't have a known
            # value. Overwrite those locations with c.missing_value.
            raw[indexer < 0] = c.missing_value

        # Delegate the actual array formatting logic to a DataFrameLoader.
        loader = DataFrameLoader(c, to_frame(raw), adjustments=None)
        out[c] = loader.load_adjusted_array([c], dates, sids, mask)[c]
    return out
def test_baseline(self):
    """Windows traversed from the loaded array match the baseline slice.

    Loads ``USEquityPricing.close`` for the first ten dates and sids at
    positions 1-2, then compares every length-3 window against the same
    rows of the baseline values.
    """
    values = arange(100).reshape(self.ndates, self.nsids)
    baseline = DataFrame(values, index=self.dates, columns=self.sids)
    loader = DataFrameLoader(USEquityPricing.close, baseline)

    dates_slice = slice(None, 10, None)
    sids_slice = slice(1, 3, None)

    # NOTE(review): the return value is unpacked directly, so this assumes
    # load_adjusted_array yields exactly one adjusted array when iterated.
    # Confirm against the loader version this test targets.
    (adj_array,) = loader.load_adjusted_array(
        [USEquityPricing.close],
        self.dates[dates_slice],
        self.sids[sids_slice],
        self.mask[dates_slice, sids_slice],
    )

    expected_block = baseline.values[dates_slice, sids_slice]
    for start, window in enumerate(adj_array.traverse(window_length=3)):
        assert_array_equal(window, expected_block[start:start + 3])
def test_baseline(self):
    """Windows traversed from the loaded array match the baseline slice.

    Loads ``USEquityPricing.close`` for the first ten dates and sids at
    positions 1-2, takes the single value of the returned mapping, and
    compares every length-3 window against the same rows of the baseline.
    """
    values = arange(100).reshape(self.ndates, self.nsids)
    baseline = DataFrame(values, index=self.dates, columns=self.sids)
    loader = DataFrameLoader(USEquityPricing.close, baseline)

    dates_slice = slice(None, 10, None)
    sids_slice = slice(1, 3, None)

    loaded = loader.load_adjusted_array(
        [USEquityPricing.close],
        self.dates[dates_slice],
        self.sids[sids_slice],
        self.mask[dates_slice, sids_slice],
    )
    # Exactly one column was requested, so exactly one array is expected.
    (adj_array,) = loaded.values()

    expected_block = baseline.values[dates_slice, sids_slice]
    for start, window in enumerate(adj_array.traverse(window_length=3)):
        assert_array_equal(window, expected_block[start:start + 3])
def _load_events(self, name_map, indexer, domain, columns, dates, sids, mask):
    """Load one adjusted array per requested column from ``self.events``.

    Parameters
    ----------
    name_map
        Mapping from each requested column to its key in ``self.events``.
    indexer
        2D integer array of shape ``(len(dates), len(sids))`` giving, for
        each date/sid cell, the position of the event to use. Negative
        entries mark cells with no known value.
    domain, dates, sids, mask
        Forwarded to ``DataFrameLoader.load_adjusted_array``.
    columns
        Columns to load; each must expose ``missing_value`` and ``dtype``.

    Returns
    -------
    dict
        Maps each requested column to the array loaded for it.
    """
    grid_shape = (len(dates), len(sids))
    assert indexer.shape == grid_shape

    def load_column(column, grid):
        # Delegate the actual array formatting logic to a DataFrameLoader.
        frame = pd.DataFrame(grid, index=dates, columns=sids)
        frame_loader = DataFrameLoader(column, frame, adjustments=None)
        loaded = frame_loader.load_adjusted_array(
            domain, [column], dates, sids, mask,
        )
        return loaded[column]

    results = {}
    for column in columns:
        # Every event's value for this column.
        event_values = self.events[name_map[column]]

        if len(event_values) == 0:
            # With no events at all, the -1 "missing" index used below
            # cannot be applied to the empty array, so fill the whole
            # grid with the column's missing value instead.
            grid = np.full(
                grid_shape,
                column.missing_value,
                dtype=column.dtype,
            )
        else:
            # Place event values at their date/sid cells; the result has
            # the same shape as `indexer`.
            grid = event_values[indexer]
            # Negative index means "no known value" for that cell.
            grid[indexer < 0] = column.missing_value

        results[column] = load_column(column, grid)
    return results
def test_adjustments(self):
    """Check adjustment formatting and the arguments passed on loading.

    Builds a loader with four adjustments expected to show up in the
    output and five expected to be dropped, verifies the result of
    ``format_adjustments`` for a date/sid sub-window, then patches
    ``adjusted_array`` and checks the keyword arguments it is called with.
    """
    def adjustment(sid, start_date, end_date, apply_date, value, kind):
        # Key order matches the hand-written dicts this helper replaces.
        return {
            'sid': sid,
            'start_date': start_date,
            'end_date': end_date,
            'apply_date': apply_date,
            'value': value,
            'kind': kind,
        }

    def ignored(sid, start_date, end_date, apply_date):
        # Sentinel overwrite that must never appear in the output.
        return adjustment(
            sid, start_date, end_date, apply_date, -9999.0, OVERWRITE,
        )

    dates = self.dates
    values = arange(100).reshape(self.ndates, self.nsids)
    baseline = DataFrame(values, index=dates, columns=self.sids)

    # Use the dates from index 10 on and sids 1-3.
    dates_slice = slice(10, None, None)
    sids_slice = slice(1, 4, None)

    # Adjustments that should actually affect the output.
    relevant_adjustments = [
        adjustment(1, None, dates[15], dates[16], 0.5, MULTIPLY),
        adjustment(2, dates[5], dates[15], dates[16], 1.0, ADD),
        adjustment(2, dates[15], dates[16], dates[17], 1.0, ADD),
        adjustment(3, dates[16], dates[17], dates[18], 99.0, OVERWRITE),
    ]

    # These adjustments shouldn't affect the output.
    irrelevant_adjustments = [
        # Sid Not Requested
        ignored(0, dates[16], dates[17], dates[18]),
        # Sid Unknown
        ignored(9999, dates[16], dates[17], dates[18]),
        # Date Not Requested
        ignored(2, dates[1], dates[2], dates[3]),
        # Date Before Known Data
        ignored(
            2,
            dates[0] - (2 * trading_day),
            dates[0] - trading_day,
            dates[0] - trading_day,
        ),
        # Date After Known Data
        ignored(
            2,
            dates[-1] + trading_day,
            dates[-1] + (2 * trading_day),
            dates[-1] + (3 * trading_day),
        ),
    ]

    adjustments = DataFrame(relevant_adjustments + irrelevant_adjustments)
    loader = DataFrameLoader(
        USEquityPricing.close,
        baseline,
        adjustments=adjustments,
    )

    requested_dates = dates[dates_slice]
    requested_sids = self.sids[sids_slice]
    expected_baseline = baseline.iloc[dates_slice, sids_slice]

    formatted_adjustments = loader.format_adjustments(
        requested_dates,
        requested_sids,
    )
    # NOTE(review): keys 6/7/8 look like the apply dates (16/17/18)
    # re-based to the start of dates_slice (10) -- confirm against
    # format_adjustments.
    expected_formatted_adjustments = {
        6: [
            Float64Multiply(
                first_row=0, last_row=5, first_col=0, last_col=0, value=0.5,
            ),
            Float64Add(
                first_row=0, last_row=5, first_col=1, last_col=1, value=1.0,
            ),
        ],
        7: [
            Float64Add(
                first_row=5, last_row=6, first_col=1, last_col=1, value=1.0,
            ),
        ],
        8: [
            Float64Overwrite(
                first_row=6, last_row=7, first_col=2, last_col=2, value=99.0,
            ),
        ],
    }
    self.assertEqual(formatted_adjustments, expected_formatted_adjustments)

    mask = self.mask[dates_slice, sids_slice]
    with patch('zipline.pipeline.loaders.frame.adjusted_array') as m:
        loader.load_adjusted_array(
            columns=[USEquityPricing.close],
            dates=requested_dates,
            assets=requested_sids,
            mask=mask,
        )

    self.assertEqual(m.call_count, 1)

    _, call_kwargs = m.call_args
    assert_array_equal(call_kwargs['data'], expected_baseline.values)
    assert_array_equal(call_kwargs['mask'], mask)
    self.assertEqual(
        call_kwargs['adjustments'],
        expected_formatted_adjustments,
    )
def test_adjustments(self):
    """Check adjustment formatting and the arguments passed on loading.

    Builds a loader with four adjustments expected to show up in the
    output and five expected to be dropped, verifies the result of
    ``format_adjustments`` for a date/sid sub-window, then patches
    ``AdjustedArray`` and checks the keyword arguments it is called with.
    """
    # Field order matches the column order of the original literal dicts.
    fields = ('sid', 'start_date', 'end_date', 'apply_date', 'value', 'kind')

    def as_records(rows):
        return [dict(zip(fields, row)) for row in rows]

    dates = self.dates
    day = trading_day

    data = arange(100).reshape(self.ndates, self.nsids)
    baseline = DataFrame(data, index=dates, columns=self.sids)

    # Use the dates from index 10 on and sids 1-3.
    dates_slice = slice(10, None, None)
    sids_slice = slice(1, 4, None)

    # Adjustments that should actually affect the output.
    relevant_adjustments = as_records([
        (1, None, dates[15], dates[16], 0.5, MULTIPLY),
        (2, dates[5], dates[15], dates[16], 1.0, ADD),
        (2, dates[15], dates[16], dates[17], 1.0, ADD),
        (3, dates[16], dates[17], dates[18], 99.0, OVERWRITE),
    ])

    # These adjustments shouldn't affect the output.
    irrelevant_adjustments = as_records([
        # Sid Not Requested
        (0, dates[16], dates[17], dates[18], -9999.0, OVERWRITE),
        # Sid Unknown
        (9999, dates[16], dates[17], dates[18], -9999.0, OVERWRITE),
        # Date Not Requested
        (2, dates[1], dates[2], dates[3], -9999.0, OVERWRITE),
        # Date Before Known Data
        (
            2,
            dates[0] - (2 * day),
            dates[0] - day,
            dates[0] - day,
            -9999.0,
            OVERWRITE,
        ),
        # Date After Known Data
        (
            2,
            dates[-1] + day,
            dates[-1] + (2 * day),
            dates[-1] + (3 * day),
            -9999.0,
            OVERWRITE,
        ),
    ])

    adjustments = DataFrame(relevant_adjustments + irrelevant_adjustments)
    loader = DataFrameLoader(
        USEquityPricing.close,
        baseline,
        adjustments=adjustments,
    )

    expected_baseline = baseline.iloc[dates_slice, sids_slice]

    formatted_adjustments = loader.format_adjustments(
        dates[dates_slice],
        self.sids[sids_slice],
    )

    multiply_sid_1 = Float64Multiply(
        first_row=0, last_row=5, first_col=0, last_col=0, value=0.5,
    )
    first_add_sid_2 = Float64Add(
        first_row=0, last_row=5, first_col=1, last_col=1, value=1.0,
    )
    second_add_sid_2 = Float64Add(
        first_row=5, last_row=6, first_col=1, last_col=1, value=1.0,
    )
    overwrite_sid_3 = Float64Overwrite(
        first_row=6, last_row=7, first_col=2, last_col=2, value=99.0,
    )
    expected_formatted_adjustments = {
        6: [multiply_sid_1, first_add_sid_2],
        7: [second_add_sid_2],
        8: [overwrite_sid_3],
    }
    self.assertEqual(formatted_adjustments, expected_formatted_adjustments)

    mask = self.mask[dates_slice, sids_slice]
    with patch('zipline.pipeline.loaders.frame.AdjustedArray') as m:
        loader.load_adjusted_array(
            columns=[USEquityPricing.close],
            dates=dates[dates_slice],
            assets=self.sids[sids_slice],
            mask=mask,
        )

    self.assertEqual(m.call_count, 1)

    passed = m.call_args[1]
    assert_array_equal(passed['data'], expected_baseline.values)
    assert_array_equal(passed['mask'], mask)
    self.assertEqual(passed['adjustments'], expected_formatted_adjustments)
def test_adjustments(self):
    """Check adjustment formatting and the arguments passed on loading.

    Builds a loader with four adjustments expected to show up in the
    output and five expected to be dropped, verifies the result of
    ``format_adjustments`` for a date/sid sub-window, then patches
    ``AdjustedArray`` and checks the ``data`` and ``adjustments`` keyword
    arguments it is called with.
    """
    def make(sid, start_date, end_date, apply_date, value=-9999.0,
             kind=OVERWRITE):
        # Defaults describe the sentinel overwrite shared by every
        # adjustment that is expected to be ignored. Key order matches
        # the literal dicts this helper replaces.
        return {
            "sid": sid,
            "start_date": start_date,
            "end_date": end_date,
            "apply_date": apply_date,
            "value": value,
            "kind": kind,
        }

    dates = self.dates
    one_day = self.trading_day
    first_date, last_date = dates[0], dates[-1]

    data = np.arange(100).reshape(self.ndates, self.nsids)
    baseline = pd.DataFrame(data, index=dates, columns=self.sids)

    # Use the dates from index 10 on and sids 1-3.
    dates_slice = slice(10, None, None)
    sids_slice = slice(1, 4, None)

    # Adjustments that should actually affect the output.
    relevant_adjustments = [
        make(1, None, dates[15], dates[16], value=0.5, kind=MULTIPLY),
        make(2, dates[5], dates[15], dates[16], value=1.0, kind=ADD),
        make(2, dates[15], dates[16], dates[17], value=1.0, kind=ADD),
        make(3, dates[16], dates[17], dates[18], value=99.0, kind=OVERWRITE),
    ]

    # These adjustments shouldn't affect the output.
    irrelevant_adjustments = [
        # Sid Not Requested
        make(0, dates[16], dates[17], dates[18]),
        # Sid Unknown
        make(9999, dates[16], dates[17], dates[18]),
        # Date Not Requested
        make(2, dates[1], dates[2], dates[3]),
        # Date Before Known Data
        make(
            2,
            first_date - (2 * one_day),
            first_date - one_day,
            first_date - one_day,
        ),
        # Date After Known Data
        make(
            2,
            last_date + one_day,
            last_date + (2 * one_day),
            last_date + (3 * one_day),
        ),
    ]

    adjustments = pd.DataFrame(relevant_adjustments + irrelevant_adjustments)
    loader = DataFrameLoader(
        USEquityPricing.close,
        baseline,
        adjustments=adjustments,
    )

    window_dates = dates[dates_slice]
    window_sids = self.sids[sids_slice]
    expected_baseline = baseline.iloc[dates_slice, sids_slice]

    formatted_adjustments = loader.format_adjustments(
        window_dates,
        window_sids,
    )
    expected_formatted_adjustments = {
        6: [
            Float64Multiply(
                first_row=0, last_row=5, first_col=0, last_col=0, value=0.5,
            ),
            Float64Add(
                first_row=0, last_row=5, first_col=1, last_col=1, value=1.0,
            ),
        ],
        7: [
            Float64Add(
                first_row=5, last_row=6, first_col=1, last_col=1, value=1.0,
            ),
        ],
        8: [
            Float64Overwrite(
                first_row=6, last_row=7, first_col=2, last_col=2, value=99.0,
            ),
        ],
    }
    assert formatted_adjustments == expected_formatted_adjustments

    mask = self.mask[dates_slice, sids_slice]
    with patch("zipline.pipeline.loaders.frame.AdjustedArray") as m:
        loader.load_adjusted_array(
            US_EQUITIES,
            columns=[USEquityPricing.close],
            dates=window_dates,
            sids=window_sids,
            mask=mask,
        )

    assert m.call_count == 1

    _, call_kwargs = m.call_args
    assert_array_equal(call_kwargs["data"], expected_baseline.values)
    assert call_kwargs["adjustments"] == expected_formatted_adjustments