def test_inspect(self):
    data = arange(15, dtype=float).reshape(5, 3)
    adj_array = AdjustedArray(
        data,
        NOMASK,
        {4: [Float64Multiply(2, 3, 0, 0, 4.0)]},
        float('nan'),
    )

    expected = dedent(
        """\
        Adjusted Array (float64):

        Data:
        array([[  0.,   1.,   2.],
               [  3.,   4.,   5.],
               [  6.,   7.,   8.],
               [  9.,  10.,  11.],
               [ 12.,  13.,  14.]])

        Adjustments:
        {4: [Float64Multiply(first_row=2, last_row=3, first_col=0, \
last_col=0, value=4.000000)]}
        """
    )
    got = adj_array.inspect()
    self.assertEqual(expected, got)
def test_array_views_arent_writable(self):
    data = arange(30, dtype=float).reshape(6, 5)
    adj_array = AdjustedArray(data, NOMASK, {}, float('nan'))

    for frame in adj_array.traverse(3):
        with self.assertRaises(ValueError):
            frame[0, 0] = 5.0
def load_adjusted_array(self, columns, dates, assets, mask):
    # load_adjusted_array is called with dates on which the user's algo
    # will be shown data, which means we need to return the data that
    # would be known at the start of each date.  We assume that the
    # latest data known on day N is the data from day (N - 1), so we
    # shift all query dates back by a day.
    start_date, end_date = _shift_dates(
        self._all_sessions, dates[0], dates[-1], shift=1,
    )

    colnames = [c.name for c in columns]
    raw_arrays = self.raw_price_loader.load_raw_arrays(
        colnames,
        start_date,
        end_date,
        assets,
    )
    out = {}
    for c, c_raw in zip(columns, raw_arrays):
        out[c] = AdjustedArray(
            c_raw.astype(c.dtype),
            mask,
            {},
            c.missing_value,
        )
    return out
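# The comment above describes stepping the query bounds back by one trading
# session.  The sketch below is only an illustration of that idea, not the
# actual ``_shift_dates`` helper (whose signature is assumed from the call
# site); it uses ``searchsorted`` on a sorted session index.
import pandas as pd


def _illustrative_shift_dates(sessions, start_date, end_date, shift):
    # Locate the query bounds in the trading calendar and step both back by
    # ``shift`` sessions, so day N's query reads data already known at the
    # start of day N (i.e. day N - 1's data).
    start_ix = sessions.searchsorted(start_date)
    end_ix = sessions.searchsorted(end_date)
    return sessions[start_ix - shift], sessions[end_ix - shift]


sessions = pd.date_range('2014-01-02', periods=10, freq='B')
print(_illustrative_shift_dates(sessions, sessions[3], sessions[6], shift=1))
# -> (Timestamp('2014-01-06 ...'), Timestamp('2014-01-09 ...'))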
def test_bad_input(self):
    # Use a raw string so the regex escapes aren't treated as (invalid)
    # string escape sequences.
    msg = r"Mask shape \(2L?, 3L?\) != data shape \(5L?, 5L?\)"
    data = arange(25).reshape(5, 5)
    bad_mask = array([[0, 1, 1], [0, 0, 1]], dtype=bool)

    with self.assertRaisesRegexp(ValueError, msg):
        AdjustedArray(data, bad_mask, {}, missing_value=-1)
def _test_bollinger_bands(self, window_length, k, mask_last_sid):
    closes = self.closes(mask_last_sid=mask_last_sid)
    mask = ~np.isnan(closes)
    bbands = BollingerBands(window_length=window_length, k=k)

    expected = self.expected_bbands(window_length, k, closes)

    self.check_terms(
        terms={
            'upper': bbands.upper,
            'middle': bbands.middle,
            'lower': bbands.lower,
        },
        expected={
            'upper': expected[0],
            'middle': expected[1],
            'lower': expected[2],
        },
        initial_workspace={
            USEquityPricing.close: AdjustedArray(
                data=closes,
                mask=mask,
                adjustments={},
                missing_value=np.nan,
            ),
        },
        mask=self.build_mask(mask),
    )
def load_adjusted_array(self, columns, dates, assets, mask):
    """
    Load data from our stored baseline.
    """
    column = self.column
    if len(columns) != 1:
        raise ValueError(
            "Can't load multiple columns with DataFrameLoader"
        )
    elif columns[0] != column:
        raise ValueError("Can't load unknown column %s" % columns[0])

    date_indexer = self.dates.get_indexer(dates)
    assets_indexer = self.assets.get_indexer(assets)

    # Boolean arrays with True on matched entries.
    good_dates = (date_indexer != -1)
    good_assets = (assets_indexer != -1)

    return {
        column: AdjustedArray(
            # Pull out requested columns/rows from our baseline data.
            data=self.baseline[ix_(date_indexer, assets_indexer)],
            # Mask out requested columns/rows that didn't match.
            mask=(good_assets & as_column(good_dates)) & mask,
            adjustments=self.format_adjustments(dates, assets),
            missing_value=column.missing_value,
        ),
    }
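# A small, self-contained illustration of the indexer/mask logic above.
# The dates and assets here are made up for the example, and ``as_column``
# is assumed to reshape a 1-D array into an (n, 1) column vector; plain
# NumPy indexing is used instead.
import pandas as pd

stored_dates = pd.DatetimeIndex(['2014-01-02', '2014-01-03'])
stored_assets = pd.Index([1, 2, 3])

query_dates = pd.DatetimeIndex(['2014-01-02', '2014-01-06'])  # second date unknown
query_assets = pd.Index([1, 4])                               # second asset unknown

date_indexer = stored_dates.get_indexer(query_dates)      # array([ 0, -1])
assets_indexer = stored_assets.get_indexer(query_assets)  # array([ 0, -1])

good_dates = (date_indexer != -1)
good_assets = (assets_indexer != -1)

# Broadcasting a (2,) row of asset hits against a (2, 1) column of date hits
# yields the 2-D validity mask used above.
print(good_assets & good_dates[:, None])
# [[ True False]
#  [False False]]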
def test_no_adjustments(self,
                        name,
                        data,
                        lookback,
                        adjustments,
                        missing_value,
                        perspective_offset,
                        expected_output):
    array = AdjustedArray(data, NOMASK, adjustments, missing_value)

    for _ in range(2):  # Iterate 2x to ensure adjusted_arrays are re-usable.
        in_out = zip(array.traverse(lookback), expected_output)
        for yielded, expected_yield in in_out:
            check_arrays(yielded, expected_yield)
def test_invalid_lookback(self):
    data = arange(30, dtype=float).reshape(6, 5)
    adj_array = AdjustedArray(data, NOMASK, {}, float('nan'))

    with self.assertRaises(WindowLengthTooLong):
        adj_array.traverse(7)

    with self.assertRaises(WindowLengthNotPositive):
        adj_array.traverse(0)

    with self.assertRaises(WindowLengthNotPositive):
        adj_array.traverse(-1)
def test_overwrite_adjustment_cases(self,
                                    name,
                                    baseline,
                                    lookback,
                                    adjustments,
                                    missing_value,
                                    perspective_offset,
                                    expected):
    array = AdjustedArray(baseline, NOMASK, adjustments, missing_value)

    for _ in range(2):  # Iterate 2x to ensure adjusted_arrays are re-usable.
        window_iter = array.traverse(
            lookback,
            perspective_offset=perspective_offset,
        )
        for yielded, expected_yield in zip_longest(window_iter, expected):
            check_arrays(yielded, expected_yield)
def test_masking(self, dtype, missing_value, window_length):
    missing_value = coerce_to_dtype(dtype, missing_value)
    baseline_ints = arange(15).reshape(5, 3)
    baseline = baseline_ints.astype(dtype)
    mask = (baseline_ints % 2).astype(bool)
    masked_baseline = where(mask, baseline, missing_value)

    array = AdjustedArray(
        baseline,
        mask,
        adjustments={},
        missing_value=missing_value,
    )

    gen_expected = moving_window(masked_baseline, window_length)
    gen_actual = array.traverse(window_length)
    for expected, actual in zip(gen_expected, gen_actual):
        check_arrays(expected, actual)
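# ``moving_window`` (from the testing utilities) is assumed here to yield
# consecutive trailing windows of ``window_length`` rows, which is what
# ``AdjustedArray.traverse`` is being checked against.  A plain NumPy
# sketch of the same idea for a 5x3 array and window_length=2:
import numpy as np

data = np.arange(15).reshape(5, 3)
window_length = 2
windows = [
    data[i:i + window_length]
    for i in range(len(data) - window_length + 1)
]
print(len(windows))   # 4 windows
print(windows[0])     # rows 0-1
print(windows[-1])    # rows 3-4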
def load_adjusted_array(self, columns, dates, assets, mask):
    # load_adjusted_array is called with dates on which the user's algo
    # will be shown data, which means we need to return the data that
    # would be known at the start of each date.  We assume that the
    # latest data known on day N is the data from day (N - 1), so we
    # shift all query dates back by a day.
    start_date, end_date = _shift_dates(
        self._all_sessions, dates[0], dates[-1], shift=1,
    )

    colnames = [c.name for c in columns]

    if len(assets) == 0:
        raise ValueError('Pipeline cannot load data without eligible assets.')

    exchange_names = []
    for asset in assets:
        if asset.exchange not in exchange_names:
            exchange_names.append(asset.exchange)

    exchange = get_exchange(exchange_names[0])
    reader = exchange.bundle.get_reader(self.data_frequency)

    raw_arrays = reader.load_raw_arrays(
        colnames,
        start_date,
        end_date,
        assets,
    )
    out = {}
    for c, c_raw in zip(columns, raw_arrays):
        out[c] = AdjustedArray(
            c_raw.astype(c.dtype),
            mask,
            {},
            c.missing_value,
        )
    return out
def test_masking_with_strings(self, dtype, missing_value, window_length):
    missing_value = coerce_to_dtype(dtype, missing_value)
    baseline_ints = arange(15).reshape(5, 3)
    # Coerce to string first so that coercion to object gets us an array
    # of string objects.
    baseline = baseline_ints.astype(str).astype(dtype)
    mask = (baseline_ints % 2).astype(bool)

    masked_baseline = LabelArray(baseline, missing_value=missing_value)
    masked_baseline[~mask] = missing_value

    array = AdjustedArray(
        baseline,
        mask,
        adjustments={},
        missing_value=missing_value,
    )

    gen_expected = moving_window(masked_baseline, window_length)
    gen_actual = array.traverse(window_length=window_length)
    for expected, actual in zip(gen_expected, gen_actual):
        check_arrays(expected, actual)
def _load_dataset(self, dates, assets, mask, columns):
    try:
        (dataset,) = set(map(getdataset, columns))
    except ValueError:
        raise AssertionError('all columns must come from the same dataset')

    expr, deltas, checkpoints, odo_kwargs, apply_deltas_adjustments = self[
        dataset
    ]
    have_sids = (dataset.ndim == 2)
    asset_idx = pd.Series(index=assets, data=np.arange(len(assets)))
    assets = list(map(int, assets))  # coerce from numpy.int64
    added_query_fields = {AD_FIELD_NAME, TS_FIELD_NAME} | (
        {SID_FIELD_NAME} if have_sids else set()
    )
    requested_columns = set(map(getname, columns))
    colnames = sorted(added_query_fields | requested_columns)

    data_query_time = self._data_query_time
    data_query_tz = self._data_query_tz
    lower_dt, upper_dt = normalize_data_query_bounds(
        dates[0],
        dates[-1],
        data_query_time,
        data_query_tz,
    )

    def collect_expr(e, lower):
        """Materialize the expression as a dataframe.

        Parameters
        ----------
        e : Expr
            The baseline or deltas expression.
        lower : datetime
            The lower time bound to query.

        Returns
        -------
        result : pd.DataFrame
            The resulting dataframe.

        Notes
        -----
        This can return more data than needed. The in-memory reindex will
        handle this.
        """
        predicate = e[TS_FIELD_NAME] <= upper_dt
        if lower is not None:
            predicate &= e[TS_FIELD_NAME] >= lower

        return odo(e[predicate][colnames], pd.DataFrame, **odo_kwargs)

    lower, materialized_checkpoints = get_materialized_checkpoints(
        checkpoints, colnames, lower_dt, odo_kwargs,
    )

    materialized_expr = self.pool.apply_async(collect_expr, (expr, lower))
    materialized_deltas = (
        self.pool.apply(collect_expr, (deltas, lower))
        if deltas is not None
        else pd.DataFrame(columns=colnames)
    )

    if materialized_checkpoints is not None:
        materialized_expr = pd.concat(
            (
                materialized_checkpoints,
                materialized_expr.get(),
            ),
            ignore_index=True,
            copy=False,
        )

    # It's not guaranteed that assets returned by the engine will contain
    # all sids from the deltas table; filter out such mismatches here.
    if not materialized_deltas.empty and have_sids:
        materialized_deltas = materialized_deltas[
            materialized_deltas[SID_FIELD_NAME].isin(assets)
        ]

    if data_query_time is not None:
        for m in (materialized_expr, materialized_deltas):
            m.loc[:, TS_FIELD_NAME] = m.loc[:, TS_FIELD_NAME].astype(
                'datetime64[ns]'
            )
            normalize_timestamp_to_query_time(
                m,
                data_query_time,
                data_query_tz,
                inplace=True,
                ts_field=TS_FIELD_NAME,
            )

    # Inline the deltas that changed our most recently known value.
    # Also, we reindex by the dates to create a dense representation of
    # the data.
    sparse_output, non_novel_deltas = overwrite_novel_deltas(
        materialized_expr,
        materialized_deltas,
        dates,
    )

    # If we ever have cases where we find out about multiple asof_dates'
    # data on the same TS, we want to make sure that last_in_date_group
    # selects the correct last asof_date's value.
    sparse_output.sort_values(AD_FIELD_NAME, inplace=True)
    non_novel_deltas.sort_values(AD_FIELD_NAME, inplace=True)

    if AD_FIELD_NAME not in requested_columns:
        sparse_output.drop(AD_FIELD_NAME, axis=1, inplace=True)

    sparse_deltas = last_in_date_group(
        non_novel_deltas,
        dates,
        assets,
        reindex=False,
        have_sids=have_sids,
    )
    dense_output = last_in_date_group(
        sparse_output,
        dates,
        assets,
        reindex=True,
        have_sids=have_sids,
    )
    ffill_across_cols(
        dense_output,
        columns,
        {c.name: c.name for c in columns},
    )

    # By default, no non-novel deltas are applied.
    def no_adjustments_from_deltas(*args):
        return {}

    adjustments_from_deltas = no_adjustments_from_deltas
    if have_sids:
        if apply_deltas_adjustments:
            adjustments_from_deltas = adjustments_from_deltas_with_sids
        column_view = identity
    else:
        # If we do not have sids, use the column view to make a single
        # column vector which is unassociated with any assets.
        column_view = op.itemgetter(np.s_[:, np.newaxis])

        if apply_deltas_adjustments:
            adjustments_from_deltas = adjustments_from_deltas_no_sids
        mask = np.full(
            shape=(len(mask), 1),
            fill_value=True,
            dtype=bool_dtype,
        )

    return {
        column: AdjustedArray(
            column_view(
                dense_output[column.name].values.astype(column.dtype),
            ),
            mask,
            adjustments_from_deltas(
                dates,
                sparse_output[TS_FIELD_NAME].values,
                column_idx,
                column.name,
                asset_idx,
                sparse_deltas,
            ),
            column.missing_value,
        )
        for column_idx, column in enumerate(columns)
    }
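# A toy pandas sketch of the densification step used above (this is only a
# conceptual illustration, not the real ``last_in_date_group`` or
# ``ffill_across_cols`` helpers): keep the last record per (date, sid),
# pivot to a dense dates x sids frame, then forward-fill so each date
# reflects the latest known value.
import pandas as pd

dates = pd.date_range('2014-01-02', periods=4, freq='B')
records = pd.DataFrame({
    'timestamp': pd.to_datetime(['2014-01-02', '2014-01-02', '2014-01-03']),
    'sid': [1, 1, 2],
    'value': [10.0, 11.0, 20.0],  # two observations for sid 1 on the same day
})

# Keep the last observation per (timestamp, sid) group.
last_per_group = (
    records.sort_values('timestamp')
           .groupby(['timestamp', 'sid'], as_index=False)
           .last()
)

# Pivot to a dense dates x sids frame and forward-fill missing days.
dense = (
    last_per_group.pivot(index='timestamp', columns='sid', values='value')
                  .reindex(index=dates, columns=[1, 2])
                  .ffill()
)
print(dense)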