def _test_inspect(self):
    """Check that ``AdjustedArray.inspect`` renders both the underlying
    data and the adjustments mapping in the expected textual layout.

    NOTE(review): the leading underscore disables this test under the
    unittest runner — confirm whether that is intentional.
    """
    data = arange(15, dtype=float).reshape(5, 3)
    adj_array = AdjustedArray(
        data,
        NOMASK,
        # One multiplier on rows 2-3 of column 0, applied at row 4.
        {4: [Float64Multiply(2, 3, 0, 0, 4.0)]},
        float('nan'),
    )
    # NOTE(review): exact whitespace inside this literal must match the
    # numpy repr emitted by inspect() — verify against the runtime output.
    expected = dedent(
        """\
        Adjusted Array (float64):

        Data:
        array([[  0.,   1.,   2.],
               [  3.,   4.,   5.],
               [  6.,   7.,   8.],
               [  9.,  10.,  11.],
               [ 12.,  13.,  14.]])

        Adjustments:
        {4: [Float64Multiply(first_row=2, last_row=3, first_col=0, \
last_col=0, value=4.000000)]}
        """
    )
    got = adj_array.inspect()
    self.assertEqual(expected, got)
def expected_adjustments(self, start_date, end_date):
    """Build the expected price and volume adjustment mappings for all
    corporate actions whose effective date falls in [start_date, end_date].

    Parameters
    ----------
    start_date, end_date : Timestamp
        Inclusive bounds of the query window.

    Returns
    -------
    (dict, dict)
        ``(price_adjustments, volume_adjustments)``, each mapping the
        offset of the effective date within the window to a list of
        ``Float64Multiply`` objects.
    """
    prices = {}
    volumes = {}
    sessions = self.calendar_days_between(start_date, end_date)
    base_loc = sessions.get_loc(start_date)

    for frame in (SPLITS, MERGERS, DIVIDENDS_EXPECTED):
        for secs, ratio, sid in frame.itertuples(index=False):
            effective = Timestamp(secs, unit='s', tz='UTC')
            # Skip actions outside the queried window.
            if effective < start_date or effective > end_date:
                continue
            offset = sessions.get_loc(effective) - base_loc
            col = sid - 1
            # Pricing adjustments land on the effective date's row and
            # cover every row up to (and including) that offset.
            prices.setdefault(offset, []).append(
                Float64Multiply(
                    first_row=0,
                    last_row=offset,
                    first_col=col,
                    last_col=col,
                    value=ratio,
                )
            )
            # Only splits touch volume, and they do so inversely.
            if frame is SPLITS:
                volumes.setdefault(offset, []).append(
                    Float64Multiply(
                        first_row=0,
                        last_row=offset,
                        first_col=col,
                        last_col=col,
                        value=1.0 / ratio,
                    )
                )
    return prices, volumes
def split_adjustment(sid, volume):
    """Build the expected adjustment dict for the single split, which
    lands at index 252 // 2.

    The multiplier covers rows ``[0, idx]`` of column ``sid``.  For
    volume fields the value is ``sid + 2`` unchanged (via ``identity``);
    for price fields it is ``1 / (sid + 2)`` via the curried ``truediv``.
    """
    split_idx = 252 // 2
    raw = sid + 2
    # `op` is a curried operator module: op.truediv(1) is a partial that
    # computes 1 / x.
    value = identity(raw) if volume else op.truediv(1)(raw)
    return {
        split_idx: [
            Float64Multiply(
                first_row=0,
                last_row=split_idx,
                first_col=sid,
                last_col=sid,
                value=value,
            )
        ],
    }
def dividend_adjustment(sid, which):
    """Build the expected adjustment dict for one of the two dividends.

    ``which`` selects the payout: ``'first'`` lands at index 252 // 4,
    anything else at 3 * 252 // 4 (the second payout's cash amount is
    one tenth larger).  The multiplier covers rows ``[0, idx]`` of
    column ``sid``.
    """
    idx = 252 // 4 if which == 'first' else 3 * 252 // 4
    # Cash amount: (sid + 1) / 10 for the first dividend, (sid + 2) / 10
    # for the second (the bool adds 1).
    cash = (sid + 1 + (which == 'second')) / 10
    # Denominator mirrors the synthetic fixture price at the prior index
    # -- presumably the close before the ex-date; verify against the
    # fixture data generator.
    denom = idx - 1 + sid * 10000 + 2000
    return {
        idx: [
            Float64Multiply(
                first_row=0,
                last_row=idx,
                first_col=sid,
                last_col=sid,
                value=float(1 - cash / denom),
            )
        ],
    }
def test_adjustments(self):
    """Exercise DataFrameLoader adjustment handling end to end.

    ``format_adjustments`` must drop adjustments outside the requested
    date/sid window and re-index the survivors relative to the window;
    ``load_adjusted_array`` must forward the trimmed baseline, the mask,
    and the formatted adjustments to ``AdjustedArray``.
    """
    data = arange(100).reshape(self.ndates, self.nsids)
    baseline = DataFrame(data, index=self.dates, columns=self.sids)

    # Use the dates from index 10 on and sids 1-3.
    dates_slice = slice(10, None, None)
    sids_slice = slice(1, 4, None)

    # Adjustments that should actually affect the output.
    relevant_adjustments = [
        {
            'sid': 1,
            'start_date': None,
            'end_date': self.dates[15],
            'apply_date': self.dates[16],
            'value': 0.5,
            'kind': MULTIPLY,
        },
        {
            'sid': 2,
            'start_date': self.dates[5],
            'end_date': self.dates[15],
            'apply_date': self.dates[16],
            'value': 1.0,
            'kind': ADD,
        },
        {
            'sid': 2,
            'start_date': self.dates[15],
            'end_date': self.dates[16],
            'apply_date': self.dates[17],
            'value': 1.0,
            'kind': ADD,
        },
        {
            'sid': 3,
            'start_date': self.dates[16],
            'end_date': self.dates[17],
            'apply_date': self.dates[18],
            'value': 99.0,
            'kind': OVERWRITE,
        },
    ]

    # These adjustments shouldn't affect the output.
    irrelevant_adjustments = [
        {
            # Sid Not Requested
            'sid': 0,
            'start_date': self.dates[16],
            'end_date': self.dates[17],
            'apply_date': self.dates[18],
            'value': -9999.0,
            'kind': OVERWRITE,
        },
        {
            # Sid Unknown
            'sid': 9999,
            'start_date': self.dates[16],
            'end_date': self.dates[17],
            'apply_date': self.dates[18],
            'value': -9999.0,
            'kind': OVERWRITE,
        },
        {
            # Date Not Requested
            'sid': 2,
            'start_date': self.dates[1],
            'end_date': self.dates[2],
            'apply_date': self.dates[3],
            'value': -9999.0,
            'kind': OVERWRITE,
        },
        {
            # Date Before Known Data
            'sid': 2,
            'start_date': self.dates[0] - (2 * self.trading_day),
            'end_date': self.dates[0] - self.trading_day,
            'apply_date': self.dates[0] - self.trading_day,
            'value': -9999.0,
            'kind': OVERWRITE,
        },
        {
            # Date After Known Data
            'sid': 2,
            'start_date': self.dates[-1] + self.trading_day,
            'end_date': self.dates[-1] + (2 * self.trading_day),
            'apply_date': self.dates[-1] + (3 * self.trading_day),
            'value': -9999.0,
            'kind': OVERWRITE,
        },
    ]

    adjustments = DataFrame(relevant_adjustments + irrelevant_adjustments)
    loader = DataFrameLoader(
        USEquityPricing.close,
        baseline,
        adjustments=adjustments,
    )

    expected_baseline = baseline.iloc[dates_slice, sids_slice]

    formatted_adjustments = loader.format_adjustments(
        self.dates[dates_slice],
        self.sids[sids_slice],
    )
    # Keys are row offsets within the sliced window (window starts at
    # dates[10], so apply_date dates[16] -> key 6, etc.); columns are
    # offsets within the sliced sids.
    expected_formatted_adjustments = {
        6: [
            Float64Multiply(
                first_row=0,
                last_row=5,
                first_col=0,
                last_col=0,
                value=0.5,
            ),
            Float64Add(
                first_row=0,
                last_row=5,
                first_col=1,
                last_col=1,
                value=1.0,
            ),
        ],
        7: [
            Float64Add(
                first_row=5,
                last_row=6,
                first_col=1,
                last_col=1,
                value=1.0,
            ),
        ],
        8: [
            Float64Overwrite(
                first_row=6,
                last_row=7,
                first_col=2,
                last_col=2,
                value=99.0,
            )
        ],
    }
    self.assertEqual(formatted_adjustments, expected_formatted_adjustments)

    mask = self.mask[dates_slice, sids_slice]
    # Patch AdjustedArray so we can inspect exactly what the loader
    # passed to it instead of exercising the real implementation.
    with patch('catalyst.pipeline.loaders.frame.AdjustedArray') as m:
        loader.load_adjusted_array(
            columns=[USEquityPricing.close],
            dates=self.dates[dates_slice],
            assets=self.sids[sids_slice],
            mask=mask,
        )

    self.assertEqual(m.call_count, 1)

    args, kwargs = m.call_args
    assert_array_equal(kwargs['data'], expected_baseline.values)
    assert_array_equal(kwargs['mask'], mask)
    self.assertEqual(kwargs['adjustments'], expected_formatted_adjustments)
def _get_adjustments_in_range(self, asset, dts, field):
    """
    Get the Float64Multiply objects to pass to an AdjustedArrayWindow.

    For the use of AdjustedArrayWindow in the loader, which looks back
    from current simulation time back to a window of data the dictionary
    is structured with:
    - the key into the dictionary for adjustments is the location of the
      day from which the window is being viewed.
    - the start of all multiply objects is always 0 (in each window all
      adjustments are overlapping)
    - the end of the multiply object is the location before the calendar
      location of the adjustment action, making all days before the event
      adjusted.

    Parameters
    ----------
    asset : Asset
        The assets for which to get adjustments.
    dts : iterable of datetime64-like
        The dts for which adjustment data is needed.
    field : str
        OHLCV field for which to get the adjustments.

    Returns
    -------
    out : dict[loc -> Float64Multiply]
        The adjustments as a dict of loc -> Float64Multiply
    """
    sid = int(asset)
    start = normalize_date(dts[0])
    end = normalize_date(dts[-1])
    adjs = {}

    def _add(dt, ratio):
        # All multipliers span rows [0, event_loc - 1]: every row before
        # the event's location within the window gets adjusted, and the
        # dict is keyed by that same location.
        end_loc = dts.searchsorted(dt)
        adjs.setdefault(end_loc, []).append(
            Float64Multiply(0, end_loc - 1, 0, 0, ratio),
        )

    # Mergers and dividends never touch volume; splits touch everything
    # (volume inversely).  Events strictly after `start` and up to `end`
    # (inclusive) are in range.
    if field != 'volume':
        for kind in ('mergers', 'dividends'):
            events = self._adjustments_reader.get_adjustments_for_sid(
                kind, sid)
            for event in events:
                dt = event[0]
                if start < dt <= end:
                    _add(dt, event[1])

    splits = self._adjustments_reader.get_adjustments_for_sid(
        'splits', sid)
    for event in splits:
        dt = event[0]
        if start < dt <= end:
            ratio = 1.0 / event[1] if field == 'volume' else event[1]
            _add(dt, ratio)

    return adjs
def test_ingest(self):
    """Round-trip a bundle through register/ingest/load and verify that
    asset metadata, minute bars, daily bars, and split adjustments all
    read back exactly as written.
    """
    calendar = get_calendar('NYSE')
    sessions = calendar.sessions_in_range(self.START_DATE, self.END_DATE)
    minutes = calendar.minutes_for_sessions_in_range(
        self.START_DATE, self.END_DATE,
    )

    sids = tuple(range(3))
    equities = make_simple_equity_info(
        sids,
        self.START_DATE,
        self.END_DATE,
    )

    daily_bar_data = make_bar_data(equities, sessions)
    minute_bar_data = make_bar_data(equities, minutes)
    first_split_ratio = 0.5
    second_split_ratio = 0.1
    splits = pd.DataFrame.from_records([
        {
            'effective_date': str_to_seconds('2014-01-08'),
            'ratio': first_split_ratio,
            'sid': 0,
        },
        {
            'effective_date': str_to_seconds('2014-01-09'),
            'ratio': second_split_ratio,
            'sid': 1,
        },
    ])

    @self.register(
        'bundle',
        calendar_name='NYSE',
        start_session=self.START_DATE,
        end_session=self.END_DATE,
    )
    def bundle_ingest(environ,
                      asset_db_writer,
                      minute_bar_writer,
                      daily_bar_writer,
                      adjustment_writer,
                      calendar,
                      start_session,
                      end_session,
                      cache,
                      show_progress,
                      output_dir):
        # The ingest machinery must thread our environ through untouched.
        assert_is(environ, self.environ)

        asset_db_writer.write(equities=equities)
        minute_bar_writer.write(minute_bar_data)
        daily_bar_writer.write(daily_bar_data)
        adjustment_writer.write(splits=splits)

        assert_is_instance(calendar, TradingCalendar)
        assert_is_instance(cache, dataframe_cache)
        assert_is_instance(show_progress, bool)

    self.ingest('bundle', environ=self.environ)
    bundle = self.load('bundle', environ=self.environ)

    assert_equal(set(bundle.asset_finder.sids), set(sids))

    columns = 'open', 'high', 'low', 'close', 'volume'

    # Minute bars must match the synthetic fixture values.
    actual = bundle.equity_minute_bar_reader.load_raw_arrays(
        columns,
        minutes[0],
        minutes[-1],
        sids,
    )

    for actual_column, colname in zip(actual, columns):
        assert_equal(
            actual_column,
            expected_bar_values_2d(minutes, equities, colname),
            msg=colname,
        )

    # Daily bars likewise.
    actual = bundle.equity_daily_bar_reader.load_raw_arrays(
        columns,
        self.START_DATE,
        self.END_DATE,
        sids,
    )
    for actual_column, colname in zip(actual, columns):
        assert_equal(
            actual_column,
            expected_bar_values_2d(sessions, equities, colname),
            msg=colname,
        )
    adjustments_for_cols = bundle.adjustment_reader.load_adjustments(
        columns,
        sessions,
        pd.Index(sids),
    )
    for column, adjustments in zip(columns, adjustments_for_cols[:-1]):
        # iterate over all the adjustments but `volume`
        assert_equal(
            adjustments,
            {
                2: [
                    Float64Multiply(
                        first_row=0,
                        last_row=2,
                        first_col=0,
                        last_col=0,
                        value=first_split_ratio,
                    )
                ],
                3: [
                    Float64Multiply(
                        first_row=0,
                        last_row=3,
                        first_col=1,
                        last_col=1,
                        value=second_split_ratio,
                    )
                ],
            },
            msg=column,
        )

    # check the volume, the value should be 1/ratio
    assert_equal(
        adjustments_for_cols[-1],
        {
            2: [
                Float64Multiply(
                    first_row=0,
                    last_row=2,
                    first_col=0,
                    last_col=0,
                    value=1 / first_split_ratio,
                )
            ],
            3: [
                Float64Multiply(
                    first_row=0,
                    last_row=3,
                    first_col=1,
                    last_col=1,
                    value=1 / second_split_ratio,
                )
            ],
        },
        msg='volume',
    )
def _expected_data(self, asset_finder):
    """Build the pricing arrays and adjustment dicts expected to come out
    of ingesting the Quandl sample CSVs.

    Returns
    -------
    (list, list)
        ``pricing`` — one ndarray per column in ``self.columns``; and
        ``adjustments`` — one adjustment dict per column (OHLC share a
        dict, volume gets its own inverse split entry).
    """
    sids = {
        symbol: asset_finder.lookup_symbol(
            symbol,
            self.asset_start,
        ).sid
        for symbol in self.symbols
    }

    def per_symbol(symbol):
        # Load one symbol's raw sample CSV and normalize column names.
        df = pd.read_csv(
            test_resource_path('quandl_samples', symbol + '.csv.gz'),
            parse_dates=['Date'],
            index_col='Date',
            usecols=[
                'Open',
                'High',
                'Low',
                'Close',
                'Volume',
                'Date',
                'Ex-Dividend',
                'Split Ratio',
            ],
            na_values=['NA'],
        ).rename(
            columns={
                'Open': 'open',
                'High': 'high',
                'Low': 'low',
                'Close': 'close',
                'Volume': 'volume',
                'Date': 'date',
                'Ex-Dividend': 'ex_dividend',
                'Split Ratio': 'split_ratio',
            })
        df['sid'] = sids[symbol]
        return df

    # Wide frame: (date) x (field, sid).
    all_ = pd.concat(map(per_symbol, self.symbols)).set_index(
        'sid',
        append=True,
    ).unstack()

    # fancy list comprehension with statements
    @list
    @apply
    def pricing():
        for column in self.columns:
            vs = all_[column].values
            if column == 'volume':
                # Missing volume is written as 0, not NaN.
                vs = np.nan_to_num(vs)
            yield vs

    # the first index our written data will appear in the files on disk
    start_idx = (
        self.calendar.all_sessions.get_loc(self.asset_start, 'ffill') + 1
    )

    # convert an index into the raw dataframe into an index into the
    # final data; `op` is curried, so i(n) == start_idx + n
    i = op.add(start_idx)

    def expected_dividend_adjustment(idx, symbol):
        # Ratio applied for a dividend: 1 - ex_dividend / prior close.
        # NOTE(review): DataFrame.ix is deprecated (removed in pandas
        # 1.0); migrating would need explicit .iloc/.loc equivalents.
        sid = sids[symbol]
        return (
            1 -
            all_.ix[idx, ('ex_dividend', sid)] /
            all_.ix[idx - 1, ('close', sid)]
        )

    adjustments = [
        # ohlc
        {
            # dividends
            i(24): [Float64Multiply(
                first_row=0,
                last_row=i(24),
                first_col=sids['AAPL'],
                last_col=sids['AAPL'],
                value=expected_dividend_adjustment(24, 'AAPL'),
            )],
            i(87): [Float64Multiply(
                first_row=0,
                last_row=i(87),
                first_col=sids['AAPL'],
                last_col=sids['AAPL'],
                value=expected_dividend_adjustment(87, 'AAPL'),
            )],
            i(150): [Float64Multiply(
                first_row=0,
                last_row=i(150),
                first_col=sids['AAPL'],
                last_col=sids['AAPL'],
                value=expected_dividend_adjustment(150, 'AAPL'),
            )],
            i(214): [Float64Multiply(
                first_row=0,
                last_row=i(214),
                first_col=sids['AAPL'],
                last_col=sids['AAPL'],
                value=expected_dividend_adjustment(214, 'AAPL'),
            )],
            i(31): [Float64Multiply(
                first_row=0,
                last_row=i(31),
                first_col=sids['MSFT'],
                last_col=sids['MSFT'],
                value=expected_dividend_adjustment(31, 'MSFT'),
            )],
            i(90): [Float64Multiply(
                first_row=0,
                last_row=i(90),
                first_col=sids['MSFT'],
                last_col=sids['MSFT'],
                value=expected_dividend_adjustment(90, 'MSFT'),
            )],
            i(222): [Float64Multiply(
                first_row=0,
                last_row=i(222),
                first_col=sids['MSFT'],
                last_col=sids['MSFT'],
                value=expected_dividend_adjustment(222, 'MSFT'),
            )],

            # splits
            i(108): [Float64Multiply(
                first_row=0,
                last_row=i(108),
                first_col=sids['AAPL'],
                last_col=sids['AAPL'],
                value=1.0 / 7.0,
            )],
        },
    ] * (len(self.columns) - 1) + [
        # volume
        {
            i(108): [Float64Multiply(
                first_row=0,
                last_row=i(108),
                first_col=sids['AAPL'],
                last_col=sids['AAPL'],
                value=7.0,
            )],
        }
    ]
    return pricing, adjustments