def _pipeline_output_index(dates, assets, mask): """ Create a MultiIndex for a pipeline output. Parameters ---------- dates : pd.DatetimeIndex Row labels for ``mask``. assets : pd.Index Column labels for ``mask``. mask : np.ndarray[bool] Mask array indicating date/asset pairs that should be included in output index. Returns ------- index : pd.MultiIndex MultiIndex containing (date, asset) pairs corresponding to ``True`` values in ``mask``. """ date_labels = repeat_last_axis(arange(len(dates)), len(assets))[mask] asset_labels = repeat_first_axis(arange(len(assets)), len(dates))[mask] return MultiIndex( levels=[dates, assets], labels=[date_labels, asset_labels], # TODO: We should probably add names for these. names=[None, None], verify_integrity=False, )
def _compute(self, arrays, dates, assets, mask): is_my_asset = assets == self._asset.sid out = repeat_first_axis(is_my_asset, len(mask)) # Raise an exception if `self._asset` does not exist for the entirety # of the timeframe over which we are computing. if (is_my_asset.sum() != 1) or ((out & mask).sum() != len(mask)): raise NonExistentAssetInTimeFrame(asset=self._asset, start_date=dates[0], end_date=dates[-1]) return out
def _compute(self, arrays, dates, assets, mask): is_my_asset = (assets == self._asset.sid) out = repeat_first_axis(is_my_asset, len(mask)) # Raise an exception if `self._asset` does not exist for the entirety # of the timeframe over which we are computing. if (is_my_asset.sum() != 1) or ((out & mask).sum() != len(mask)): raise NonExistentAssetInTimeFrame( asset=self._asset, start_date=dates[0], end_date=dates[-1], ) return out
def _to_narrow(self, data, mask, dates, assets): """ Convert raw computed pipeline results into a DataFrame for public APIs. Parameters ---------- data : dict[str -> ndarray[ndim=2]] Dict mapping column names to computed results. mask : ndarray[bool, ndim=2] Mask array of values to keep. dates : ndarray[datetime64, ndim=1] Row index for arrays `data` and `mask` assets : ndarray[int64, ndim=2] Column index for arrays `data` and `mask` Returns ------- results : pd.DataFrame The indices of `results` are as follows: index : two-tiered MultiIndex of (date, asset). Contains an entry for each (date, asset) pair corresponding to a `True` value in `mask`. columns : Index of str One column per entry in `data`. If mask[date, asset] is True, then result.loc[(date, asset), colname] will contain the value of data[colname][date, asset]. """ if not mask.any(): # Manually handle the empty DataFrame case. This is a workaround # to pandas failing to tz_localize an empty dataframe with a # MultiIndex. It also saves us the work of applying a known-empty # mask to each array. # # Slicing `dates` here to preserve pandas metadata. empty_dates = dates[:0] empty_assets = array([], dtype=object) return DataFrame( data={ name: array([], dtype=arr.dtype) for name, arr in iteritems(data) }, index=MultiIndex.from_arrays([empty_dates, empty_assets]), ) resolved_assets = array(self._finder.retrieve_all(assets)) dates_kept = repeat_last_axis(dates.values, len(assets))[mask] assets_kept = repeat_first_axis(resolved_assets, len(dates))[mask] return DataFrame( data={name: arr[mask] for name, arr in iteritems(data)}, index=MultiIndex.from_arrays([dates_kept, assets_kept]), ).tz_localize('UTC', level=0)
def load_adjusted_array(self, domain, columns, dates, sids, mask): # load_adjusted_array is called with dates on which the user's algo # will be shown data, which means we need to return the data that would # be known at the **start** of each date. We assume that the latest # data known on day N is the data from day (N - 1), so we shift all # query dates back by a trading session. sessions = domain.all_sessions() shifted_dates = shift_dates(sessions, dates[0], dates[-1], shift=1) ohlcv_cols, currency_cols = self._split_column_types(columns) del columns # From here on we should use ohlcv_cols or currency_cols. ohlcv_colnames = [c.name for c in ohlcv_cols] raw_ohlcv_arrays = self.raw_price_reader.load_raw_arrays( ohlcv_colnames, shifted_dates[0], shifted_dates[-1], sids, ) # Currency convert raw_arrays in place if necessary. We use shifted # dates to load currency conversion rates to make them line up with # dates used to fetch prices. self._inplace_currency_convert( ohlcv_cols, raw_ohlcv_arrays, shifted_dates, sids, ) adjustments = self.adjustments_reader.load_pricing_adjustments( ohlcv_colnames, dates, sids, ) out = {} for c, c_raw, c_adjs in zip(ohlcv_cols, raw_ohlcv_arrays, adjustments): out[c] = AdjustedArray( c_raw.astype(c.dtype), c_adjs, c.missing_value, ) for c in currency_cols: codes_1d = self.raw_price_reader.currency_codes(sids) codes = repeat_first_axis(codes_1d, len(dates)) out[c] = AdjustedArray( codes, adjustments={}, missing_value=None, ) return out
def compute(self, today, assets, out, close): x_matrix = repeat_last_axis( (self.window_length - 1) / 2 - self._x, len(assets), ) y_bar = np.nanmean(close, axis=0) y_bars = repeat_first_axis(y_bar, self.window_length) y_matrix = close - y_bars out[:] = preprocess(-np.divide((x_matrix * y_matrix).sum(axis=0) / self._x_var, self.window_length))
def _to_narrow(self, data, mask, dates, assets): """ Convert raw computed pipeline results into a DataFrame for public APIs. Parameters ---------- data : dict[str -> ndarray[ndim=2]] Dict mapping column names to computed results. mask : ndarray[bool, ndim=2] Mask array of values to keep. dates : ndarray[datetime64, ndim=1] Row index for arrays `data` and `mask` assets : ndarray[int64, ndim=2] Column index for arrays `data` and `mask` Returns ------- results : pd.DataFrame The indices of `results` are as follows: index : two-tiered MultiIndex of (date, asset). Contains an entry for each (date, asset) pair corresponding to a `True` value in `mask`. columns : Index of str One column per entry in `data`. If mask[date, asset] is True, then result.loc[(date, asset), colname] will contain the value of data[colname][date, asset]. """ resolved_assets = array(self._finder.retrieve_all(assets)) dates_kept = repeat_last_axis(dates.values, len(assets))[mask] assets_kept = repeat_first_axis(resolved_assets, len(dates))[mask] return DataFrame( data={name: arr[mask] for name, arr in iteritems(data)}, index=MultiIndex.from_arrays([dates_kept, assets_kept]), ).tz_localize('UTC', level=0)
def _compute(self, arrays, dates, sids, mask): my_columns = sids.isin(self.params['sids']) return repeat_first_axis(my_columns, len(mask)) & mask
def _to_narrow(self, terms, data, mask, dates, symbols): """ Convert raw computed pipeline results into a DataFrame for public APIs. Parameters ---------- terms : dict[str -> Term] Dict mapping column names to terms. data : dict[str -> ndarray[ndim=2]] Dict mapping column names to computed results for those names. mask : ndarray[bool, ndim=2] Mask array of values to keep. dates : ndarray[datetime64, ndim=1] Row index for arrays `data` and `mask` symbols : list Column index Returns ------- results : pd.DataFrame The indices of `results` are as follows: index : two-tiered MultiIndex of (date, asset). Contains an entry for each (date, asset) pair corresponding to a `True` value in `mask`. columns : Index of str One column per entry in `data`. If mask[date, asset] is True, then result.loc[(date, asset), colname] will contain the value of data[colname][date, asset]. """ assert len(dates) == 1 if not mask.any(): # Manually handle the empty DataFrame case. This is a workaround # to pandas failing to tz_localize an empty dataframe with a # MultiIndex. It also saves us the work of applying a known-empty # mask to each array. # # Slicing `dates` here to preserve pandas metadata. # empty_dates = dates[:0] empty_assets = array([], dtype=object) return DataFrame( data={ name: array([], dtype=arr.dtype) for name, arr in iteritems(data) }, index=pd.Index(empty_assets), # index=MultiIndex.from_arrays([empty_dates, empty_assets]), ) # resolved_assets = array(self._finder.retrieve_all(assets)) # dates_kept = repeat_last_axis(dates.values, len(symbols))[mask] # assets_kept = repeat_first_axis(resolved_assets, len(dates))[mask] assets_kept = repeat_first_axis(symbols, len(dates))[mask] final_columns = {} for name in data: # Each term that computed an output has its postprocess method # called on the filtered result. # # As of Mon May 2 15:38:47 2016, we only use this to convert # LabelArrays into categoricals. final_columns[name] = terms[name].postprocess(data[name][mask]) return DataFrame( data=final_columns, index=pd.Index(assets_kept), # index=MultiIndex.from_arrays([dates_kept, assets_kept]), )