def test_bad_input(self):
    """
    Constructing an adjusted_array with a mask whose shape doesn't match
    the data's shape should raise a ValueError.
    """
    # Raw string: the backslashes are regex escapes for the literal
    # parentheses in the error message, not Python string escapes.
    # (A non-raw string here emits an invalid-escape DeprecationWarning.)
    msg = r"Mask shape \(2, 3\) != data shape \(5, 5\)"
    data = arange(25).reshape(5, 5)
    # (2, 3) mask vs. (5, 5) data: deliberately incompatible shapes.
    bad_mask = array([[0, 1, 1], [0, 0, 1]], dtype=bool)
    with self.assertRaisesRegexp(ValueError, msg):
        adjusted_array(data, bad_mask, {})
def load_adjusted_array(self, columns, dates, assets, mask):
    # On day N the user's algo should only see data that was knowable at
    # the start of day N.  We assume the freshest knowable data on day N
    # is day (N - 1)'s data, so the raw query window is shifted back by
    # one calendar entry.
    query_start, query_end = _shift_dates(
        self._calendar, dates[0], dates[-1], shift=1,
    )

    raw_arrays = self.raw_price_loader.load_raw_arrays(
        columns, query_start, query_end, assets,
    )
    adjustments = self.adjustments_loader.load_adjustments(
        columns, dates, assets,
    )

    # One adjusted_array per requested column, keyed by the column itself.
    return {
        column: adjusted_array(raw, mask, col_adjustments)
        for column, raw, col_adjustments
        in zip(columns, raw_arrays, adjustments)
    }
def load_adjusted_array(self, columns, mask):
    """
    Load data from our stored baseline.
    """
    if len(columns) != 1:
        raise ValueError(
            "Can't load multiple columns with DataFrameLoader"
        )
    elif columns[0] != self.column:
        raise ValueError("Can't load unknown column %s" % columns[0])

    dates, assets, mask_values = mask.index, mask.columns, mask.values

    row_indexer = self.dates.get_indexer(dates)
    col_indexer = self.assets.get_indexer(assets)

    # get_indexer returns -1 for labels absent from our baseline index;
    # these flags are True exactly where a match was found.
    dates_found = row_indexer != -1
    assets_found = col_indexer != -1

    # Rows/columns with no baseline match are masked out, in addition to
    # whatever the caller's mask already excludes.
    output_mask = (assets_found & dates_found[:, None]) & mask_values

    return [adjusted_array(
        # Select the requested rows/columns from the baseline data.
        data=self.baseline[ix_(row_indexer, col_indexer)],
        mask=output_mask,
        adjustments=self.format_adjustments(dates, assets),
    )]
def load_adjusted_array(self, columns, dates, assets, mask):
    """
    Load data from our stored baseline.
    """
    if len(columns) != 1:
        raise ValueError(
            "Can't load multiple columns with DataFrameLoader"
        )
    elif columns[0] != self.column:
        raise ValueError("Can't load unknown column %s" % columns[0])

    row_indexer = self.dates.get_indexer(dates)
    col_indexer = self.assets.get_indexer(assets)

    # get_indexer marks missing labels with -1; flip that into booleans
    # that are True where the requested date/asset exists in our index.
    dates_found = row_indexer != -1
    assets_found = col_indexer != -1

    result = adjusted_array(
        # Select the requested rows/columns from the baseline data.
        data=self.baseline[ix_(row_indexer, col_indexer)],
        # Drop entries with no baseline match, on top of the caller's mask.
        mask=(assets_found & dates_found[:, None]) & mask,
        adjustments=self.format_adjustments(dates, assets),
    )
    # Exactly one column was validated above, so this maps it directly.
    return {columns[0]: result}
def test_inspect(self):
    """
    inspect() should render the underlying data and the adjustments dict
    in a fixed human-readable layout.
    """
    data = arange(15, dtype=float).reshape(5, 3)
    adj_array = adjusted_array(
        data,
        NOMASK,
        {4: [Float64Multiply(2, 3, 0, 0, 4.0)]},
    )
    # NOTE(review): the expected literal below was reconstructed from a
    # whitespace-mangled copy of this file; the internal alignment of the
    # array repr should be confirmed against inspect()'s actual output.
    expected = dedent(
        """\
        Adjusted Array:

        Data:
        array([[  0.,   1.,   2.],
               [  3.,   4.,   5.],
               [  6.,   7.,   8.],
               [  9.,  10.,  11.],
               [ 12.,  13.,  14.]])

        Adjustments:
        {4: [Float64Multiply(first_row=2, last_row=3, first_col=0, \
last_col=0, value=4.000000)]}
        """
    )
    self.assertEqual(expected, adj_array.inspect())
def load_adjusted_array(self, columns, dates, assets, mask):
    # The user's algo sees data on day N that was knowable at the start
    # of that day.  The latest knowable data on day N is day (N - 1)'s,
    # so shift the raw-data query window back by one day.
    query_start, query_end = _shift_dates(
        self._calendar, dates[0], dates[-1], shift=1,
    )

    raw_arrays = self.raw_price_loader.load_raw_arrays(
        columns, query_start, query_end, assets,
    )
    adjustments = self.adjustments_loader.load_adjustments(
        columns, dates, assets,
    )

    # Pair each raw array with its column's adjustments, preserving the
    # order of `columns`.
    results = []
    for raw, col_adjustments in zip(raw_arrays, adjustments):
        results.append(adjusted_array(raw, mask, col_adjustments))
    return results
def test_array_views_arent_writable(self):
    """
    Windows yielded by traverse() should be read-only views: writing to
    one must raise a ValueError.
    """
    buf = arange(30, dtype=float).reshape(6, 5)
    adj_array = adjusted_array(buf, NOMASK, {})
    for window in adj_array.traverse(3):
        with self.assertRaises(ValueError):
            window[0, 0] = 5.0
def test_no_adjustments(self, name, data, lookback, adjustments, expected):
    """
    Traversing an adjusted_array should yield exactly the expected
    windows, and the array should be traversable more than once.
    """
    arr = adjusted_array(data, NOMASK, adjustments)
    # Two passes: the second proves adjusted_arrays are re-usable.
    for _pass in (1, 2):
        windows = arr.traverse(lookback)
        # zip_longest surfaces a length mismatch as a None comparison.
        for actual, wanted in zip_longest(windows, expected):
            assert_array_equal(actual, wanted)
def test_invalid_lookback(self):
    """
    traverse() should reject window lengths longer than the data and
    window lengths that aren't strictly positive.
    """
    buf = arange(30, dtype=float).reshape(6, 5)
    adj_array = adjusted_array(buf, NOMASK, {})

    # 7 > 6 rows of data.
    with self.assertRaises(WindowLengthTooLong):
        adj_array.traverse(7)

    # Zero and negative lengths are both rejected.
    for bad_length in (0, -1):
        with self.assertRaises(WindowLengthNotPositive):
            adj_array.traverse(bad_length)
def test_overwrite_adjustment_cases(self, name, data, lookback, adjustments, expected):
    """
    Overwrite adjustments applied while traversing should produce the
    expected windows, on repeated traversals.
    """
    arr = adjusted_array(data, NOMASK, adjustments)
    iteration = 0
    while iteration < 2:  # second traversal checks re-usability
        pairs = zip_longest(arr.traverse(lookback), expected)
        for actual, wanted in pairs:
            assert_array_equal(actual, wanted)
        iteration += 1
def load_adjusted_array(self, columns, mask):
    # The mask DataFrame carries the query coordinates: dates on the
    # index, assets on the columns.
    dates, assets = mask.index, mask.columns

    raw_arrays = self.raw_price_loader.load_raw_arrays(
        columns, dates, assets,
    )
    adjustments = self.adjustments_loader.load_adjustments(
        columns, dates, assets,
    )

    mask_values = mask.values
    results = []
    for raw, col_adjustments in zip(raw_arrays, adjustments):
        results.append(adjusted_array(raw, mask_values, col_adjustments))
    return results
def test_inspect(self):
    """
    inspect() should render the underlying data and the adjustments dict
    in a fixed human-readable layout.
    """
    data = arange(15, dtype=float).reshape(5, 3)
    adj_array = adjusted_array(
        data,
        NOMASK,
        {4: [Float64Multiply(2, 3, 0, 0, 4.0)]},
    )
    # NOTE(review): the expected literal below was reconstructed from a
    # whitespace-mangled copy of this file; the internal alignment of the
    # array repr should be confirmed against inspect()'s actual output.
    expected = dedent("""\
    Adjusted Array:

    Data:
    array([[  0.,   1.,   2.],
           [  3.,   4.,   5.],
           [  6.,   7.,   8.],
           [  9.,  10.,  11.],
           [ 12.,  13.,  14.]])

    Adjustments:
    {4: [Float64Multiply(first_row=2, last_row=3, first_col=0, \
last_col=0, value=4.000000)]}
    """)
    self.assertEqual(expected, adj_array.inspect())
def _load_dataset(self, dates, assets, mask, columns):
    """
    Load raw data for ``columns``, which must all belong to a single
    dataset registered on this loader, and yield
    ``(column, adjusted_array)`` pairs.

    The baseline expression is materialized over the query window, novel
    deltas are overwritten into it, and the result is forward-filled onto
    ``dates``.  Non-novel deltas become per-column adjustments.

    Parameters
    ----------
    dates : pd.DatetimeIndex
        The dates to load data for.
    assets : iterable
        Asset identifiers (coerced to int below); columns of the output.
    mask : ndarray
        Mask forwarded to each ``adjusted_array``.
    columns : iterable
        The dataset columns to load; all must come from the same dataset.

    Yields
    ------
    (column, adjusted_array)
        One pair per entry of ``columns``.
    """
    try:
        # Destructuring a 1-element set enforces "exactly one dataset".
        (dataset,) = set(map(getdataset, columns))
    except ValueError:
        raise AssertionError('all columns must come from the same dataset')

    expr, deltas, resources = self[dataset]
    # Datasets without a sid field are treated as macro (per-date only)
    # data and broadcast across all assets below.
    have_sids = SID_FIELD_NAME in expr.fields
    assets = list(map(int, assets))  # coerce from numpy.int64
    fields = list(map(dataset_name, columns))
    # Always query the asof/timestamp bookkeeping fields; the sid field
    # only exists for non-macro datasets.
    query_fields = fields + [AD_FIELD_NAME, TS_FIELD_NAME] + (
        [SID_FIELD_NAME] if have_sids else []
    )

    def where(e):
        """Create the query to run against the resources.

        Parameters
        ----------
        e : Expr
            The baseline or deltas expression.

        Returns
        -------
        q : Expr
            The query to run.
        """
        ts = e[TS_FIELD_NAME]
        # Hack to get the lower bound to query:
        # This must be strictly executed because the data for `ts` will
        # be removed from scope too early otherwise.
        lower = odo(ts[ts <= dates[0]].max(), pd.Timestamp)
        selection = ts <= dates[-1]
        if have_sids:
            selection &= e[SID_FIELD_NAME].isin(assets)
        # NaT lower bound means no row is timestamped on/before dates[0];
        # in that case we keep everything up to dates[-1].
        if lower is not pd.NaT:
            selection &= ts >= lower

        return e[selection][query_fields]

    # `resources` (if any) are passed to odo as the expression's scope.
    extra_kwargs = {'d': resources} if resources else {}
    materialized_expr = odo(where(expr), pd.DataFrame, **extra_kwargs)
    # No deltas expression -> empty frame with the same schema, so the
    # downstream merge logic needs no special-casing.
    materialized_deltas = (
        odo(where(deltas), pd.DataFrame, **extra_kwargs)
        if deltas is not None else
        pd.DataFrame(columns=query_fields)
    )

    # Inline the deltas that changed our most recently known value.
    # Also, we reindex by the dates to create a dense representation of
    # the data.
    sparse_output, non_novel_deltas = overwrite_novel_deltas(
        materialized_expr,
        materialized_deltas,
        dates,
    )
    # The asof-date column has served its purpose; drop it in place.
    sparse_output.drop(AD_FIELD_NAME, axis=1, inplace=True)

    if have_sids:
        # Unstack by the sid so that we get a multi-index on the columns
        # of datacolumn, sid.
        sparse_output = sparse_output.set_index(
            [TS_FIELD_NAME, SID_FIELD_NAME],
        ).unstack()
        sparse_deltas = non_novel_deltas.set_index(
            [TS_FIELD_NAME, SID_FIELD_NAME],
        ).unstack()

        # Forward-fill so every requested date has the latest known value.
        dense_output = sparse_output.reindex(dates, method='ffill')
        cols = dense_output.columns
        # Reindex columns onto the full (field, asset) product so assets
        # with no data still get (NaN) columns.
        dense_output = dense_output.reindex(
            columns=pd.MultiIndex.from_product(
                (cols.levels[0], assets),
                names=cols.names,
            ),
        )

        adjustments_from_deltas = adjustments_from_deltas_with_sids
        column_view = identity
    else:
        # We use the column view to make an array per asset.
        column_view = compose(
            # We need to copy this because we need a concrete ndarray.
            # The `repeat_last_axis` call will give us a fancy strided
            # array which uses a buffer to represent `len(assets)` columns.
            # The engine puts nans at the indicies for which we do not have
            # sid information so that the nan-aware reductions still work.
            # A future change to the engine would be to add first class
            # support for macro econimic datasets.
            copy,
            partial(repeat_last_axis, count=len(assets)),
        )
        sparse_output = sparse_output.set_index(TS_FIELD_NAME)
        dense_output = sparse_output.reindex(dates, method='ffill')
        sparse_deltas = non_novel_deltas.set_index(TS_FIELD_NAME)
        adjustments_from_deltas = adjustments_from_deltas_no_sids

    for column_idx, column in enumerate(columns):
        column_name = column.name
        yield column, adjusted_array(
            # For macro data, column_view broadcasts the single series
            # across len(assets) columns; otherwise it's identity.
            column_view(
                dense_output[column_name].values.astype(column.dtype),
            ),
            mask,
            adjustments_from_deltas(
                dates,
                sparse_output.index,
                column_idx,
                column_name,
                assets,
                sparse_deltas,
            )
        )