def load_adjusted_array(self, columns, dates, assets, mask):
    data_query_time = self._data_query_time
    data_query_tz = self._data_query_tz
    lower_dt, upper_dt = normalize_data_query_bounds(
        dates[0],
        dates[-1],
        data_query_time,
        data_query_tz,
    )

    raw = ffill_query_in_range(
        self._expr,
        lower_dt,
        upper_dt,
        self._odo_kwargs,
    )
    sids = raw.loc[:, SID_FIELD_NAME]
    raw.drop(
        sids[~sids.isin(assets)].index,
        inplace=True
    )
    if data_query_time is not None:
        normalize_timestamp_to_query_time(
            raw,
            data_query_time,
            data_query_tz,
            inplace=True,
            ts_field=TS_FIELD_NAME,
        )

    gb = raw.groupby(SID_FIELD_NAME)
    return self.concrete_loader(
        dates,
        self.prepare_data(raw, gb),
        dataset=self._dataset,
    ).load_adjusted_array(columns, dates, assets, mask)
def load_raw_data(assets,
                  dates,
                  data_query_time,
                  data_query_tz,
                  expr,
                  odo_kwargs,
                  checkpoints=None):
    """
    Given an expression representing data to load, perform normalization and
    forward-filling and return the data, materialized. Only accepts data with
    a `sid` field.

    Parameters
    ----------
    assets : pd.Int64Index
        The assets to load data for.
    dates : pd.DatetimeIndex
        The simulation dates to load data for.
    data_query_time : datetime.time
        The time used as the cutoff for new information.
    data_query_tz : tzinfo
        The timezone to normalize your dates to before comparing against
        `time`.
    expr : Expr
        The expression representing the data to load.
    odo_kwargs : dict
        Extra keyword arguments to pass to odo when executing the expression.
    checkpoints : Expr, optional
        The expression representing the checkpointed data for `expr`.

    Returns
    -------
    raw : pd.DataFrame
        The result of computing expr and materializing the result as a
        dataframe.
    """
    lower_dt, upper_dt = normalize_data_query_bounds(
        dates[0],
        dates[-1],
        data_query_time,
        data_query_tz,
    )
    raw = ffill_query_in_range(
        expr,
        lower_dt,
        upper_dt,
        checkpoints=checkpoints,
        odo_kwargs=odo_kwargs,
    )
    sids = raw[SID_FIELD_NAME]
    raw.drop(
        sids[~sids.isin(assets)].index,
        inplace=True
    )
    if data_query_time is not None:
        normalize_timestamp_to_query_time(
            raw,
            data_query_time,
            data_query_tz,
            inplace=True,
            ts_field=TS_FIELD_NAME,
        )
    return raw
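# -- Editor's sketch (not part of the loader) -------------------------------
# A rough, self-contained illustration of what the query-time normalization
# above is meant to achieve: a record timestamped after the daily cutoff
# (e.g. 8:45 US/Eastern) should only become visible at the next session's
# cutoff. `toy_normalize` is a hypothetical stand-in for
# `normalize_timestamp_to_query_time`; the real function differs in details
# (in-place mutation, DST edge cases).
from datetime import time

import pandas as pd


def toy_normalize(df, cutoff, tz, ts_field='timestamp'):
    """Map each UTC timestamp to the first daily cutoff at or after it."""
    local = df[ts_field].dt.tz_localize('utc').dt.tz_convert(tz)
    after_cutoff = (local.dt.time > cutoff).astype(int)
    day = local.dt.normalize() + after_cutoff * pd.Timedelta(days=1)
    out = df.copy()
    out[ts_field] = (
        day + pd.Timedelta(hours=cutoff.hour, minutes=cutoff.minute)
    ).dt.tz_convert('utc').dt.tz_localize(None)
    return out


events = pd.DataFrame({
    'timestamp': pd.to_datetime(['2014-01-06 12:00', '2014-01-06 14:00']),
})
# 12:00 UTC is 07:00 Eastern (before the cutoff) and maps to the same day's
# 08:45 cutoff; 14:00 UTC is 09:00 Eastern (after it) and maps to the next
# day's cutoff.
print(toy_normalize(events, time(8, 45), 'US/Eastern'))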
def load_adjusted_array(self, columns, dates, assets, mask):
    data_query_time = self._data_query_time
    data_query_tz = self._data_query_tz
    lower_dt, upper_dt = normalize_data_query_bounds(
        dates[0],
        dates[-1],
        data_query_time,
        data_query_tz,
    )

    raw = ffill_query_in_range(
        self._expr,
        lower_dt,
        upper_dt,
        self._odo_kwargs,
    )
    sids = raw.loc[:, SID_FIELD_NAME]
    raw.drop(
        sids[~sids.isin(assets)].index,
        inplace=True
    )
    if data_query_time is not None:
        normalize_timestamp_to_query_time(
            raw,
            data_query_time,
            data_query_tz,
            inplace=True,
            ts_field=TS_FIELD_NAME,
        )

    gb = raw.groupby(SID_FIELD_NAME)

    def mkseries(idx, raw_loc=raw.loc):
        vs = raw_loc[
            idx, [TS_FIELD_NAME, ANNOUNCEMENT_FIELD_NAME]
        ].values
        return pd.Series(
            index=pd.DatetimeIndex(vs[:, 0]),
            data=vs[:, 1],
        )

    return EarningsCalendarLoader(
        dates,
        valmap(mkseries, gb.groups),
        dataset=self._dataset,
    ).load_adjusted_array(columns, dates, assets, mask)
def load_adjusted_array(self, columns, dates, assets, mask):
    data_query_time = self._data_query_time
    data_query_tz = self._data_query_tz
    lower_dt, upper_dt = normalize_data_query_bounds(
        dates[0],
        dates[-1],
        data_query_time,
        data_query_tz,
    )

    raw = ffill_query_in_range(
        self._expr,
        lower_dt,
        upper_dt,
        self._odo_kwargs,
    )
    sids = raw.loc[:, SID_FIELD_NAME]
    raw.drop(
        sids[~sids.isin(assets)].index,
        inplace=True
    )
    if data_query_time is not None:
        normalize_timestamp_to_query_time(
            raw,
            data_query_time,
            data_query_tz,
            inplace=True,
            ts_field=TS_FIELD_NAME,
        )

    return EventsLoader(
        events=raw,
        next_value_columns=self._next_value_columns,
        previous_value_columns=self._previous_value_columns,
    ).load_adjusted_array(
        columns,
        dates,
        assets,
        mask,
    )
def test_normalize_to_query_time(self, expected, tz, dates):
    # Order matters in pandas 0.18.2. Prior to that, using tz_convert on
    # a DatetimeIndex with DST/EST timestamps mixed resulted in some of
    # them being an hour off (1 hour past midnight).
    for scrambler in self.combos:
        df = pd.DataFrame({"timestamp": dates[scrambler]})

        result = normalize_timestamp_to_query_time(df,
                                                   time(8, 45),
                                                   tz,
                                                   inplace=False,
                                                   ts_field='timestamp')

        timestamps = result['timestamp'].values
        check_arrays(np.sort(timestamps), np.sort(expected[scrambler]))
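# -- Editor's note (not part of the test) ------------------------------------
# The sorted comparison above asserts only on the *set* of normalized
# timestamps, not their order, since the order coming out of tz_convert was
# not stable across the pandas versions mentioned. The same order-insensitive
# comparison pattern with plain numpy:
import numpy as np

a = np.array(['2014-01-06T13:45', '2014-01-07T13:45'], dtype='datetime64[m]')
b = a[::-1]  # same values, scrambled order
np.testing.assert_array_equal(np.sort(a), np.sort(b))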
def _load_dataset(self, dates, assets, mask, columns):
    try:
        (dataset,) = set(map(getdataset, columns))
    except ValueError:
        raise AssertionError('all columns must come from the same dataset')

    expr, deltas, checkpoints, odo_kwargs = self[dataset]
    have_sids = (dataset.ndim == 2)
    asset_idx = pd.Series(index=assets, data=np.arange(len(assets)))
    assets = list(map(int, assets))  # coerce from numpy.int64
    added_query_fields = [AD_FIELD_NAME, TS_FIELD_NAME] + (
        [SID_FIELD_NAME] if have_sids else []
    )
    colnames = added_query_fields + list(map(getname, columns))

    data_query_time = self._data_query_time
    data_query_tz = self._data_query_tz
    lower_dt, upper_dt = normalize_data_query_bounds(
        dates[0],
        dates[-1],
        data_query_time,
        data_query_tz,
    )

    def collect_expr(e, lower):
        """Materialize the expression as a dataframe.

        Parameters
        ----------
        e : Expr
            The baseline or deltas expression.
        lower : datetime
            The lower time bound to query.

        Returns
        -------
        result : pd.DataFrame
            The resulting dataframe.

        Notes
        -----
        This can return more data than needed. The in memory reindex will
        handle this.
        """
        predicate = e[TS_FIELD_NAME] <= upper_dt
        if lower is not None:
            predicate &= e[TS_FIELD_NAME] >= lower

        return odo(e[predicate][colnames], pd.DataFrame, **odo_kwargs)

    if checkpoints is not None:
        ts = checkpoints[TS_FIELD_NAME]
        checkpoints_ts = odo(ts[ts <= lower_dt].max(), pd.Timestamp)
        if pd.isnull(checkpoints_ts):
            materialized_checkpoints = pd.DataFrame(columns=colnames)
            lower = None
        else:
            materialized_checkpoints = odo(
                checkpoints[ts == checkpoints_ts][colnames],
                pd.DataFrame,
                **odo_kwargs
            )
            lower = checkpoints_ts
    else:
        materialized_checkpoints = pd.DataFrame(columns=colnames)
        lower = None

    materialized_expr = collect_expr(expr, lower)
    if materialized_checkpoints is not None:
        materialized_expr = pd.concat(
            (
                materialized_checkpoints,
                materialized_expr,
            ),
            ignore_index=True,
            copy=False,
        )

    materialized_deltas = (
        collect_expr(deltas, lower)
        if deltas is not None else
        pd.DataFrame(columns=colnames)
    )

    # It's not guaranteed that assets returned by the engine will contain
    # all sids from the deltas table; filter out such mismatches here.
    if not materialized_deltas.empty and have_sids:
        materialized_deltas = materialized_deltas[
            materialized_deltas[SID_FIELD_NAME].isin(assets)
        ]

    if data_query_time is not None:
        for m in (materialized_expr, materialized_deltas):
            m.loc[:, TS_FIELD_NAME] = m.loc[
                :, TS_FIELD_NAME
            ].astype('datetime64[ns]')

            normalize_timestamp_to_query_time(
                m,
                data_query_time,
                data_query_tz,
                inplace=True,
                ts_field=TS_FIELD_NAME,
            )

    # Inline the deltas that changed our most recently known value.
    # Also, we reindex by the dates to create a dense representation of
    # the data.
    sparse_output, non_novel_deltas = overwrite_novel_deltas(
        materialized_expr,
        materialized_deltas,
        dates,
    )
    sparse_output.drop(AD_FIELD_NAME, axis=1, inplace=True)

    def last_in_date_group(df, reindex, have_sids=have_sids):
        idx = dates[dates.searchsorted(
            df[TS_FIELD_NAME].values.astype('datetime64[D]')
        )]
        if have_sids:
            idx = [idx, SID_FIELD_NAME]

        last_in_group = df.drop(TS_FIELD_NAME, axis=1).groupby(
            idx,
            sort=False,
        ).last()

        if have_sids:
            last_in_group = last_in_group.unstack()

        if reindex:
            if have_sids:
                cols = last_in_group.columns
                last_in_group = last_in_group.reindex(
                    index=dates,
                    columns=pd.MultiIndex.from_product(
                        (cols.levels[0], assets),
                        names=cols.names,
                    ),
                )
            else:
                last_in_group = last_in_group.reindex(dates)

        return last_in_group

    sparse_deltas = last_in_date_group(non_novel_deltas, reindex=False)
    dense_output = last_in_date_group(sparse_output, reindex=True)
    dense_output.ffill(inplace=True)

    # Fill in missing values specified by each column. This is made
    # significantly more complex by the fact that we need to work around
    # two pandas issues:
    # 1) When we have sids, if there are no records for a given sid for any
    #    dates, pandas will generate a column full of NaNs for that sid.
    #    This means that some of the columns in `dense_output` are now
    #    float instead of the intended dtype, so we have to coerce back to
    #    our expected type and convert NaNs into the desired missing value.
    # 2) DataFrame.fillna assumes that receiving None as a fill-value means
    #    that no value was passed. Consequently, there's no way to tell
    #    pandas to replace NaNs in an object column with None using fillna,
    #    so we have to roll our own instead using df.where.
    for column in columns:
        # Special logic for strings since `fillna` doesn't work if the
        # missing value is `None`.
        if column.dtype == categorical_dtype:
            dense_output[column.name] = dense_output[
                column.name
            ].where(pd.notnull(dense_output[column.name]),
                    column.missing_value)
        else:
            # We need to execute `fillna` before `astype` in case the
            # column contains NaNs and needs to be cast to bool or int.
            # This is so that the NaNs are replaced first, since pandas
            # can't convert NaNs for those types.
            dense_output[column.name] = dense_output[
                column.name
            ].fillna(column.missing_value).astype(column.dtype)

    if have_sids:
        adjustments_from_deltas = adjustments_from_deltas_with_sids
        column_view = identity
    else:
        # If we do not have sids, use the column view to make a single
        # column vector which is unassociated with any assets.
        column_view = op.itemgetter(np.s_[:, np.newaxis])
        adjustments_from_deltas = adjustments_from_deltas_no_sids
        mask = np.full(
            shape=(len(mask), 1), fill_value=True, dtype=bool_dtype,
        )

    for column_idx, column in enumerate(columns):
        column_name = column.name
        yield column, AdjustedArray(
            column_view(
                dense_output[column_name].values.astype(column.dtype),
            ),
            mask,
            adjustments_from_deltas(
                dates,
                sparse_output[TS_FIELD_NAME].values,
                column_idx,
                column_name,
                asset_idx,
                sparse_deltas,
            ),
            column.missing_value,
        )
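# -- Editor's sketch (not part of the loader) -------------------------------
# A toy version of the checkpoint bookkeeping above, using plain DataFrames
# in place of blaze expressions: pick the newest checkpoint at or before the
# query's lower bound and only scan the baseline table from that timestamp
# forward. `toy_checkpointed_query` is hypothetical; the real code pushes the
# same predicates through blaze/odo instead of filtering in memory.
import pandas as pd


def toy_checkpointed_query(baseline, checkpoints, lower_dt, upper_dt):
    ts = checkpoints['timestamp']
    checkpoint_ts = ts[ts <= lower_dt].max()
    if pd.isnull(checkpoint_ts):
        # No usable checkpoint: scan everything up to the upper bound.
        snapshot = baseline.iloc[:0]
        lower = None
    else:
        snapshot = checkpoints[ts == checkpoint_ts]
        lower = checkpoint_ts

    predicate = baseline['timestamp'] <= upper_dt
    if lower is not None:
        predicate &= baseline['timestamp'] >= lower
    return pd.concat([snapshot, baseline[predicate]], ignore_index=True)


baseline = pd.DataFrame({
    'timestamp': pd.to_datetime(['2014-01-02', '2014-01-10', '2014-02-03']),
    'value': [1.0, 2.0, 3.0],
})
checkpoints = pd.DataFrame({
    'timestamp': pd.to_datetime(['2014-01-01', '2014-02-01']),
    'value': [0.5, 2.5],
})
# Querying February only needs the 2014-02-01 checkpoint plus newer rows.
print(toy_checkpointed_query(
    baseline,
    checkpoints,
    pd.Timestamp('2014-02-01'),
    pd.Timestamp('2014-02-28'),
))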
def _load_dataset(self, dates, assets, mask, columns):
    try:
        (dataset,) = set(map(getdataset, columns))
    except ValueError:
        raise AssertionError('all columns must come from the same dataset')

    expr, deltas, odo_kwargs = self[dataset]
    have_sids = SID_FIELD_NAME in expr.fields
    asset_idx = pd.Series(index=assets, data=np.arange(len(assets)))
    assets = list(map(int, assets))  # coerce from numpy.int64
    added_query_fields = [AD_FIELD_NAME, TS_FIELD_NAME] + (
        [SID_FIELD_NAME] if have_sids else []
    )

    data_query_time = self._data_query_time
    data_query_tz = self._data_query_tz
    lower_dt, upper_dt = normalize_data_query_bounds(
        dates[0],
        dates[-1],
        data_query_time,
        data_query_tz,
    )

    def where(e, column):
        """Create the query to run against the resources.

        Parameters
        ----------
        e : Expr
            The baseline or deltas expression.
        column : BoundColumn
            The column to query for.

        Returns
        -------
        q : Expr
            The query to run for the given column.
        """
        colname = column.name
        pred = e[TS_FIELD_NAME] <= lower_dt
        schema = e[colname].schema.measure
        if isinstance(schema, Option):
            pred &= e[colname].notnull()
            schema = schema.ty
        if schema in floating:
            pred &= ~e[colname].isnan()

        filtered = e[pred]
        lower = filtered.timestamp.max()
        if have_sids:
            # If we have sids, then we need to take the earliest of the
            # greatest date that has a non-null value by sid.
            lower = bz.by(
                filtered[SID_FIELD_NAME],
                timestamp=lower,
            ).timestamp.min()

        lower = odo(lower, pd.Timestamp)
        if lower is pd.NaT:
            # If there is no lower date, just query for data in the date
            # range. It must all be null anyways.
            lower = lower_dt

        return e[
            (e[TS_FIELD_NAME] >= lower) & (e[TS_FIELD_NAME] <= upper_dt)
        ][added_query_fields + [colname]]

    def collect_expr(e):
        """Execute and merge all of the per-column subqueries.

        Parameters
        ----------
        e : Expr
            The baseline or deltas expression.

        Returns
        -------
        result : pd.DataFrame
            The resulting dataframe.

        Notes
        -----
        This can return more data than needed. The in memory reindex will
        handle this.
        """
        return sort_values(reduce(
            partial(pd.merge, on=added_query_fields, how='outer'),
            (
                odo(where(e, column), pd.DataFrame, **odo_kwargs)
                for column in columns
            ),
        ), TS_FIELD_NAME)  # sort for the groupby later

    materialized_expr = collect_expr(expr)
    materialized_deltas = (
        collect_expr(deltas)
        if deltas is not None else
        pd.DataFrame(
            columns=added_query_fields + list(map(getname, columns)),
        )
    )

    if data_query_time is not None:
        for m in (materialized_expr, materialized_deltas):
            m.loc[:, TS_FIELD_NAME] = m.loc[
                :, TS_FIELD_NAME
            ].astype('datetime64[ns]')

            normalize_timestamp_to_query_time(
                m,
                data_query_time,
                data_query_tz,
                inplace=True,
                ts_field=TS_FIELD_NAME,
            )

    # Inline the deltas that changed our most recently known value.
    # Also, we reindex by the dates to create a dense representation of
    # the data.
    sparse_output, non_novel_deltas = overwrite_novel_deltas(
        materialized_expr,
        materialized_deltas,
        dates,
    )
    sparse_output.drop(AD_FIELD_NAME, axis=1, inplace=True)

    def last_in_date_group(df, reindex, have_sids=have_sids):
        idx = dates[dates.searchsorted(
            df[TS_FIELD_NAME].values.astype('datetime64[D]')
        )]
        if have_sids:
            idx = [idx, SID_FIELD_NAME]

        last_in_group = df.drop(TS_FIELD_NAME, axis=1).groupby(
            idx,
            sort=False,
        ).last()

        if have_sids:
            last_in_group = last_in_group.unstack()

        if reindex:
            if have_sids:
                cols = last_in_group.columns
                last_in_group = last_in_group.reindex(
                    index=dates,
                    columns=pd.MultiIndex.from_product(
                        (cols.levels[0], assets),
                        names=cols.names,
                    ),
                )
            else:
                last_in_group = last_in_group.reindex(dates)

        return last_in_group

    sparse_deltas = last_in_date_group(non_novel_deltas, reindex=False)
    dense_output = last_in_date_group(sparse_output, reindex=True)
    dense_output.ffill(inplace=True)

    if have_sids:
        adjustments_from_deltas = adjustments_from_deltas_with_sids
        column_view = identity
    else:
        # We use the column view to make an array per asset.
        column_view = compose(
            # We need to copy this because we need a concrete ndarray.
            # The `repeat_last_axis` call will give us a fancy strided
            # array which uses a buffer to represent `len(assets)` columns.
            # The engine puts nans at the indices for which we do not have
            # sid information so that the nan-aware reductions still work.
            # A future change to the engine would be to add first class
            # support for macro economic datasets.
            copy,
            partial(repeat_last_axis, count=len(assets)),
        )
        adjustments_from_deltas = adjustments_from_deltas_no_sids

    for column_idx, column in enumerate(columns):
        column_name = column.name
        yield column, AdjustedArray(
            column_view(
                dense_output[column_name].values.astype(column.dtype),
            ),
            mask,
            adjustments_from_deltas(
                dates,
                sparse_output[TS_FIELD_NAME].values,
                column_idx,
                column_name,
                asset_idx,
                sparse_deltas,
            ),
        )
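# -- Editor's sketch (not part of the loader) -------------------------------
# A minimal illustration of the `last_in_date_group` + reindex + ffill dance
# above: collapse multiple records per (session, sid) to the last one, pivot
# sids into columns, then reindex onto the full session calendar and
# forward-fill. Toy data only; the real helper also handles the no-sid case
# and MultiIndex column reindexing.
import pandas as pd

sessions = pd.date_range('2014-01-06', periods=4)
records = pd.DataFrame({
    'timestamp': pd.to_datetime([
        '2014-01-06', '2014-01-06', '2014-01-07', '2014-01-09',
    ]),
    'sid': [65, 65, 66, 65],
    'value': [1.0, 1.5, 2.0, 3.0],
})

# Map each record's timestamp to a session, keep the last record per
# (session, sid), then pivot and densify.
session_of = sessions[sessions.searchsorted(records['timestamp'].values)]
last = (
    records.drop('timestamp', axis=1)
           .groupby([session_of, 'sid'], sort=False)
           .last()
)
dense = last['value'].unstack().reindex(sessions).ffill()
print(dense)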
def _load_dataset(self, dates, assets, mask, columns):
    try:
        (dataset,) = set(map(getdataset, columns))
    except ValueError:
        raise AssertionError('all columns must come from the same dataset')

    expr, deltas, checkpoints, odo_kwargs, apply_deltas_adjustments = self[
        dataset
    ]
    have_sids = (dataset.ndim == 2)
    asset_idx = pd.Series(index=assets, data=np.arange(len(assets)))
    assets = list(map(int, assets))  # coerce from numpy.int64
    added_query_fields = {AD_FIELD_NAME, TS_FIELD_NAME} | (
        {SID_FIELD_NAME} if have_sids else set()
    )
    requested_columns = set(map(getname, columns))
    colnames = sorted(added_query_fields | requested_columns)

    data_query_time = self._data_query_time
    data_query_tz = self._data_query_tz
    lower_dt, upper_dt = normalize_data_query_bounds(
        dates[0],
        dates[-1],
        data_query_time,
        data_query_tz,
    )

    def collect_expr(e, lower):
        """Materialize the expression as a dataframe.

        Parameters
        ----------
        e : Expr
            The baseline or deltas expression.
        lower : datetime
            The lower time bound to query.

        Returns
        -------
        result : pd.DataFrame
            The resulting dataframe.

        Notes
        -----
        This can return more data than needed. The in memory reindex will
        handle this.
        """
        predicate = e[TS_FIELD_NAME] <= upper_dt
        if lower is not None:
            predicate &= e[TS_FIELD_NAME] >= lower

        return odo(e[predicate][colnames], pd.DataFrame, **odo_kwargs)

    lower, materialized_checkpoints = get_materialized_checkpoints(
        checkpoints, colnames, lower_dt, odo_kwargs
    )

    materialized_expr = self.pool.apply_async(collect_expr, (expr, lower))
    materialized_deltas = (
        self.pool.apply(collect_expr, (deltas, lower))
        if deltas is not None else
        pd.DataFrame(columns=colnames)
    )

    if materialized_checkpoints is not None:
        materialized_expr = pd.concat(
            (
                materialized_checkpoints,
                materialized_expr.get(),
            ),
            ignore_index=True,
            copy=False,
        )

    # It's not guaranteed that assets returned by the engine will contain
    # all sids from the deltas table; filter out such mismatches here.
    if not materialized_deltas.empty and have_sids:
        materialized_deltas = materialized_deltas[
            materialized_deltas[SID_FIELD_NAME].isin(assets)
        ]

    if data_query_time is not None:
        for m in (materialized_expr, materialized_deltas):
            m.loc[:, TS_FIELD_NAME] = m.loc[
                :, TS_FIELD_NAME
            ].astype('datetime64[ns]')

            normalize_timestamp_to_query_time(
                m,
                data_query_time,
                data_query_tz,
                inplace=True,
                ts_field=TS_FIELD_NAME,
            )

    # Inline the deltas that changed our most recently known value.
    # Also, we reindex by the dates to create a dense representation of
    # the data.
    sparse_output, non_novel_deltas = overwrite_novel_deltas(
        materialized_expr,
        materialized_deltas,
        dates,
    )

    if AD_FIELD_NAME not in requested_columns:
        sparse_output.drop(AD_FIELD_NAME, axis=1, inplace=True)

    sparse_deltas = last_in_date_group(non_novel_deltas,
                                       dates,
                                       assets,
                                       reindex=False,
                                       have_sids=have_sids)
    dense_output = last_in_date_group(sparse_output,
                                      dates,
                                      assets,
                                      reindex=True,
                                      have_sids=have_sids)
    ffill_across_cols(dense_output, columns, {c.name: c.name for c in columns})

    # By default, no non-novel deltas are applied.
    def no_adjustments_from_deltas(*args):
        return {}

    adjustments_from_deltas = no_adjustments_from_deltas
    if have_sids:
        if apply_deltas_adjustments:
            adjustments_from_deltas = adjustments_from_deltas_with_sids
        column_view = identity
    else:
        # If we do not have sids, use the column view to make a single
        # column vector which is unassociated with any assets.
        column_view = op.itemgetter(np.s_[:, np.newaxis])
        if apply_deltas_adjustments:
            adjustments_from_deltas = adjustments_from_deltas_no_sids
        mask = np.full(
            shape=(len(mask), 1), fill_value=True, dtype=bool_dtype,
        )

    return {
        column: AdjustedArray(
            column_view(
                dense_output[column.name].values.astype(column.dtype),
            ),
            mask,
            adjustments_from_deltas(
                dates,
                sparse_output[TS_FIELD_NAME].values,
                column_idx,
                column.name,
                asset_idx,
                sparse_deltas,
            ),
            column.missing_value,
        )
        for column_idx, column in enumerate(columns)
    }
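# -- Editor's sketch (not part of the loader) -------------------------------
# The `self.pool.apply_async` / `self.pool.apply` pair above lets the
# baseline query run in the background while the deltas query executes, with
# `.get()` joining the result just before the concat. The same pattern with a
# plain thread pool (the loader's actual pool type depends on how `self.pool`
# is configured):
from multiprocessing.pool import ThreadPool
from time import sleep


def slow_query(name):
    # Stand-in for materializing a blaze expression with odo.
    sleep(0.1)
    return name + ' result'


pool = ThreadPool(2)
baseline_async = pool.apply_async(slow_query, ('baseline',))  # background
deltas = pool.apply(slow_query, ('deltas',))                  # blocks here
baseline = baseline_async.get()                               # join async work
print(baseline, deltas)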
def _load_dataset(self, dates, assets, mask, columns):
    try:
        (dataset,) = set(map(getdataset, columns))
    except ValueError:
        raise AssertionError('all columns must come from the same dataset')

    expr, deltas, checkpoints, odo_kwargs = self[dataset]
    have_sids = SID_FIELD_NAME in expr.fields
    asset_idx = pd.Series(index=assets, data=np.arange(len(assets)))
    assets = list(map(int, assets))  # coerce from numpy.int64
    added_query_fields = [AD_FIELD_NAME, TS_FIELD_NAME] + (
        [SID_FIELD_NAME] if have_sids else []
    )
    colnames = added_query_fields + list(map(getname, columns))

    data_query_time = self._data_query_time
    data_query_tz = self._data_query_tz
    lower_dt, upper_dt = normalize_data_query_bounds(
        dates[0],
        dates[-1],
        data_query_time,
        data_query_tz,
    )

    def collect_expr(e, lower):
        """Materialize the expression as a dataframe.

        Parameters
        ----------
        e : Expr
            The baseline or deltas expression.
        lower : datetime
            The lower time bound to query.

        Returns
        -------
        result : pd.DataFrame
            The resulting dataframe.

        Notes
        -----
        This can return more data than needed. The in memory reindex will
        handle this.
        """
        predicate = e[TS_FIELD_NAME] <= upper_dt
        if lower is not None:
            predicate &= e[TS_FIELD_NAME] >= lower

        return odo(e[predicate][colnames], pd.DataFrame, **odo_kwargs)

    if checkpoints is not None:
        ts = checkpoints[TS_FIELD_NAME]
        checkpoints_ts = odo(ts[ts <= lower_dt].max(), pd.Timestamp)
        if pd.isnull(checkpoints_ts):
            materialized_checkpoints = pd.DataFrame(columns=colnames)
            lower = None
        else:
            materialized_checkpoints = odo(
                checkpoints[ts == checkpoints_ts][colnames],
                pd.DataFrame,
                **odo_kwargs
            )
            lower = checkpoints_ts
    else:
        materialized_checkpoints = pd.DataFrame(columns=colnames)
        lower = None

    materialized_expr = collect_expr(expr, lower)
    if materialized_checkpoints is not None:
        materialized_expr = pd.concat(
            (
                materialized_checkpoints,
                materialized_expr,
            ),
            ignore_index=True,
            copy=False,
        )

    materialized_deltas = (
        collect_expr(deltas, lower)
        if deltas is not None else
        pd.DataFrame(columns=colnames)
    )

    # It's not guaranteed that assets returned by the engine will contain
    # all sids from the deltas table; filter out such mismatches here.
    if not materialized_deltas.empty and have_sids:
        materialized_deltas = materialized_deltas[
            materialized_deltas[SID_FIELD_NAME].isin(assets)
        ]

    if data_query_time is not None:
        for m in (materialized_expr, materialized_deltas):
            m.loc[:, TS_FIELD_NAME] = m.loc[
                :, TS_FIELD_NAME
            ].astype('datetime64[ns]')

            normalize_timestamp_to_query_time(
                m,
                data_query_time,
                data_query_tz,
                inplace=True,
                ts_field=TS_FIELD_NAME,
            )

    # Inline the deltas that changed our most recently known value.
    # Also, we reindex by the dates to create a dense representation of
    # the data.
    sparse_output, non_novel_deltas = overwrite_novel_deltas(
        materialized_expr,
        materialized_deltas,
        dates,
    )
    sparse_output.drop(AD_FIELD_NAME, axis=1, inplace=True)

    def last_in_date_group(df, reindex, have_sids=have_sids):
        idx = dates[dates.searchsorted(
            df[TS_FIELD_NAME].values.astype('datetime64[D]')
        )]
        if have_sids:
            idx = [idx, SID_FIELD_NAME]

        last_in_group = df.drop(TS_FIELD_NAME, axis=1).groupby(
            idx,
            sort=False,
        ).last()

        if have_sids:
            last_in_group = last_in_group.unstack()

        if reindex:
            if have_sids:
                cols = last_in_group.columns
                last_in_group = last_in_group.reindex(
                    index=dates,
                    columns=pd.MultiIndex.from_product(
                        (cols.levels[0], assets),
                        names=cols.names,
                    ),
                )
            else:
                last_in_group = last_in_group.reindex(dates)

        return last_in_group

    sparse_deltas = last_in_date_group(non_novel_deltas, reindex=False)
    dense_output = last_in_date_group(sparse_output, reindex=True)
    dense_output.ffill(inplace=True)

    # Fill in missing values specified by each column. This is made
    # significantly more complex by the fact that we need to work around
    # two pandas issues:
    # 1) When we have sids, if there are no records for a given sid for any
    #    dates, pandas will generate a column full of NaNs for that sid.
    #    This means that some of the columns in `dense_output` are now
    #    float instead of the intended dtype, so we have to coerce back to
    #    our expected type and convert NaNs into the desired missing value.
    # 2) DataFrame.fillna assumes that receiving None as a fill-value means
    #    that no value was passed. Consequently, there's no way to tell
    #    pandas to replace NaNs in an object column with None using fillna,
    #    so we have to roll our own instead using df.where.
    for column in columns:
        # Special logic for strings since `fillna` doesn't work if the
        # missing value is `None`.
        if column.dtype == categorical_dtype:
            dense_output[column.name] = dense_output[
                column.name
            ].where(pd.notnull(dense_output[column.name]),
                    column.missing_value)
        else:
            # We need to execute `fillna` before `astype` in case the
            # column contains NaNs and needs to be cast to bool or int.
            # This is so that the NaNs are replaced first, since pandas
            # can't convert NaNs for those types.
            dense_output[column.name] = dense_output[
                column.name
            ].fillna(column.missing_value).astype(column.dtype)

    if have_sids:
        adjustments_from_deltas = adjustments_from_deltas_with_sids
        column_view = identity
    else:
        # We use the column view to make an array per asset.
        column_view = compose(
            # We need to copy this because we need a concrete ndarray.
            # The `repeat_last_axis` call will give us a fancy strided
            # array which uses a buffer to represent `len(assets)` columns.
            # The engine puts nans at the indices for which we do not have
            # sid information so that the nan-aware reductions still work.
            # A future change to the engine would be to add first class
            # support for macro economic datasets.
            copy,
            partial(repeat_last_axis, count=len(assets)),
        )
        adjustments_from_deltas = adjustments_from_deltas_no_sids

    for column_idx, column in enumerate(columns):
        column_name = column.name
        yield column, AdjustedArray(
            column_view(
                dense_output[column_name].values.astype(column.dtype),
            ),
            mask,
            adjustments_from_deltas(
                dates,
                sparse_output[TS_FIELD_NAME].values,
                column_idx,
                column_name,
                asset_idx,
                sparse_deltas,
            ),
            column.missing_value,
        )
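# -- Editor's sketch (not part of the loader) -------------------------------
# The two pandas workarounds described in the comment above, on toy data:
# `fillna` has to run before `astype` so that NaNs never reach an int/bool
# cast, and object (string) columns cannot use `fillna(None)` because pandas
# reads a `None` fill value as "no value passed" (it raises), so the loader
# falls back to `where`. Depending on the pandas version, the missing slot
# below ends up holding None or NaN.
import numpy as np
import pandas as pd

dense = pd.DataFrame({
    'shares': [100.0, np.nan],  # intended dtype int64; NaN forced it to float
    'name': ['ACME', np.nan],   # intended missing value: None
})

# Numeric/bool columns: fill the missing value first, then cast.
dense['shares'] = dense['shares'].fillna(0).astype('int64')

# Object columns: `where` instead of `fillna(None)`.
dense['name'] = dense['name'].where(pd.notnull(dense['name']), None)

print(dense.dtypes)
print(dense)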
def _load_dataset(self, dates, assets, mask, columns):
    try:
        (dataset,) = set(map(getdataset, columns))
    except ValueError:
        raise AssertionError('all columns must come from the same dataset')

    expr, deltas, odo_kwargs = self[dataset]
    have_sids = SID_FIELD_NAME in expr.fields
    asset_idx = pd.Series(index=assets, data=np.arange(len(assets)))
    assets = list(map(int, assets))  # coerce from numpy.int64
    added_query_fields = [AD_FIELD_NAME, TS_FIELD_NAME] + (
        [SID_FIELD_NAME] if have_sids else []
    )

    data_query_time = self._data_query_time
    data_query_tz = self._data_query_tz
    lower_dt, upper_dt = normalize_data_query_bounds(
        dates[0],
        dates[-1],
        data_query_time,
        data_query_tz,
    )

    def where(e):
        """Create the query to run against the resources.

        Parameters
        ----------
        e : Expr
            The baseline or deltas expression.

        Returns
        -------
        q : Expr
            The query to run.
        """
        def lower_for_col(column):
            pred = e[TS_FIELD_NAME] <= lower_dt
            colname = column.name
            schema = e[colname].schema.measure
            if isinstance(schema, Option):
                pred &= e[colname].notnull()
                schema = schema.ty
            if schema in floating:
                pred &= ~e[colname].isnan()

            filtered = e[pred]
            lower = filtered[TS_FIELD_NAME].max()
            if have_sids:
                # If we have sids, then we need to take the earliest of the
                # greatest date that has a non-null value by sid.
                lower = bz.by(
                    filtered[SID_FIELD_NAME],
                    timestamp=lower,
                ).timestamp.min()

            return lower

        lower = odo(
            reduce(
                bz.least,
                map(lower_for_col, columns),
            ),
            pd.Timestamp,
            **odo_kwargs
        )
        if lower is pd.NaT:
            lower = lower_dt

        return e[
            (e[TS_FIELD_NAME] >= lower) & (e[TS_FIELD_NAME] <= upper_dt)
        ][added_query_fields + list(map(getname, columns))]

    def collect_expr(e):
        """Execute and merge all of the per-column subqueries.

        Parameters
        ----------
        e : Expr
            The baseline or deltas expression.

        Returns
        -------
        result : pd.DataFrame
            The resulting dataframe.

        Notes
        -----
        This can return more data than needed. The in memory reindex will
        handle this.
        """
        df = odo(where(e), pd.DataFrame, **odo_kwargs)
        df.sort(TS_FIELD_NAME, inplace=True)  # sort for the groupby later
        return df

    materialized_expr = collect_expr(expr)
    materialized_deltas = (
        collect_expr(deltas)
        if deltas is not None else
        pd.DataFrame(
            columns=added_query_fields + list(map(getname, columns)),
        )
    )

    # It's not guaranteed that assets returned by the engine will contain
    # all sids from the deltas table; filter out such mismatches here.
    if not materialized_deltas.empty and have_sids:
        materialized_deltas = materialized_deltas[
            materialized_deltas[SID_FIELD_NAME].isin(assets)
        ]

    if data_query_time is not None:
        for m in (materialized_expr, materialized_deltas):
            m.loc[:, TS_FIELD_NAME] = m.loc[
                :, TS_FIELD_NAME
            ].astype('datetime64[ns]')

            normalize_timestamp_to_query_time(
                m,
                data_query_time,
                data_query_tz,
                inplace=True,
                ts_field=TS_FIELD_NAME,
            )

    # Inline the deltas that changed our most recently known value.
    # Also, we reindex by the dates to create a dense representation of
    # the data.
    sparse_output, non_novel_deltas = overwrite_novel_deltas(
        materialized_expr,
        materialized_deltas,
        dates,
    )
    sparse_output.drop(AD_FIELD_NAME, axis=1, inplace=True)

    def last_in_date_group(df, reindex, have_sids=have_sids):
        idx = dates[dates.searchsorted(
            df[TS_FIELD_NAME].values.astype('datetime64[D]')
        )]
        if have_sids:
            idx = [idx, SID_FIELD_NAME]

        last_in_group = df.drop(TS_FIELD_NAME, axis=1).groupby(
            idx,
            sort=False,
        ).last()

        if have_sids:
            last_in_group = last_in_group.unstack()

        if reindex:
            if have_sids:
                cols = last_in_group.columns
                last_in_group = last_in_group.reindex(
                    index=dates,
                    columns=pd.MultiIndex.from_product(
                        (cols.levels[0], assets),
                        names=cols.names,
                    ),
                )
            else:
                last_in_group = last_in_group.reindex(dates)

        return last_in_group

    sparse_deltas = last_in_date_group(non_novel_deltas, reindex=False)
    dense_output = last_in_date_group(sparse_output, reindex=True)
    dense_output.ffill(inplace=True)

    if have_sids:
        adjustments_from_deltas = adjustments_from_deltas_with_sids
        column_view = identity
    else:
        # We use the column view to make an array per asset.
        column_view = compose(
            # We need to copy this because we need a concrete ndarray.
            # The `repeat_last_axis` call will give us a fancy strided
            # array which uses a buffer to represent `len(assets)` columns.
            # The engine puts nans at the indices for which we do not have
            # sid information so that the nan-aware reductions still work.
            # A future change to the engine would be to add first class
            # support for macro economic datasets.
            copy,
            partial(repeat_last_axis, count=len(assets)),
        )
        adjustments_from_deltas = adjustments_from_deltas_no_sids

    for column_idx, column in enumerate(columns):
        column_name = column.name
        yield column, AdjustedArray(
            column_view(
                dense_output[column_name].values.astype(column.dtype),
            ),
            mask,
            adjustments_from_deltas(
                dates,
                sparse_output[TS_FIELD_NAME].values,
                column_idx,
                column_name,
                asset_idx,
                sparse_deltas,
            ),
            column.missing_value,
        )
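# -- Editor's sketch (not part of the loader) -------------------------------
# What `lower_for_col` above computes, redone with plain pandas instead of
# blaze: for a single column, find the newest timestamp at or before the
# query start that still has a non-null value for every sid -- the max per
# sid, then the min across sids. The loader then takes the least such bound
# across all requested columns (`bz.least`) so no column loses its
# forward-fill seed.
import numpy as np
import pandas as pd

rows = pd.DataFrame({
    'timestamp': pd.to_datetime([
        '2014-01-02', '2014-01-05', '2014-01-03', '2013-12-30',
    ]),
    'sid': [65, 65, 66, 66],
    'value': [1.0, 2.0, np.nan, 3.0],
})
lower_dt = pd.Timestamp('2014-01-06')

valid = rows[(rows['timestamp'] <= lower_dt) & rows['value'].notnull()]
per_sid_newest = valid.groupby('sid')['timestamp'].max()
lower = per_sid_newest.min()  # sid 66's newest non-null row is 2013-12-30
print(lower)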
def _load_dataset(self, dates, assets, mask, columns):
    try:
        (dataset,) = set(map(getdataset, columns))
    except ValueError:
        raise AssertionError('all columns must come from the same dataset')

    expr, deltas, resources = self[dataset]
    have_sids = SID_FIELD_NAME in expr.fields
    assets = list(map(int, assets))  # coerce from numpy.int64
    fields = list(map(dataset_name, columns))
    query_fields = fields + [AD_FIELD_NAME, TS_FIELD_NAME] + (
        [SID_FIELD_NAME] if have_sids else []
    )

    data_query_time = self._data_query_time
    data_query_tz = self._data_query_tz
    lower_dt, upper_dt = normalize_data_query_bounds(
        dates[0],
        dates[-1],
        data_query_time,
        data_query_tz,
    )

    def where(e):
        """Create the query to run against the resources.

        Parameters
        ----------
        e : Expr
            The baseline or deltas expression.

        Returns
        -------
        q : Expr
            The query to run.
        """
        ts = e[TS_FIELD_NAME]

        # Hack to get the lower bound to query:
        # This must be strictly executed because the data for `ts` will
        # be removed from scope too early otherwise.
        lower = odo(ts[ts <= lower_dt].max(), pd.Timestamp)

        selection = ts <= upper_dt
        if have_sids:
            selection &= e[SID_FIELD_NAME].isin(assets)
        if lower is not pd.NaT:
            selection &= ts >= lower

        return e[selection][query_fields]

    extra_kwargs = {'d': resources} if resources else {}
    materialized_expr = odo(where(expr), pd.DataFrame, **extra_kwargs)
    materialized_deltas = (
        odo(where(deltas), pd.DataFrame, **extra_kwargs)
        if deltas is not None else
        pd.DataFrame(columns=query_fields)
    )

    if data_query_time is not None:
        for m in (materialized_expr, materialized_deltas):
            m.loc[:, TS_FIELD_NAME] = m.loc[
                :, TS_FIELD_NAME
            ].astype('datetime64[ns]')

            normalize_timestamp_to_query_time(
                m,
                data_query_time,
                data_query_tz,
                inplace=True,
                ts_field=TS_FIELD_NAME,
            )

    # Inline the deltas that changed our most recently known value.
    # Also, we reindex by the dates to create a dense representation of
    # the data.
    sparse_output, non_novel_deltas = overwrite_novel_deltas(
        materialized_expr,
        materialized_deltas,
        dates,
    )
    sparse_output.drop(AD_FIELD_NAME, axis=1, inplace=True)

    if have_sids:
        # Unstack by the sid so that we get a multi-index on the columns
        # of datacolumn, sid.
        sparse_output = sparse_output.set_index(
            [TS_FIELD_NAME, SID_FIELD_NAME],
        ).unstack()
        sparse_deltas = non_novel_deltas.set_index(
            [TS_FIELD_NAME, SID_FIELD_NAME],
        ).unstack()

        dense_output = sparse_output.reindex(dates, method='ffill')
        cols = dense_output.columns
        dense_output = dense_output.reindex(
            columns=pd.MultiIndex.from_product(
                (cols.levels[0], assets),
                names=cols.names,
            ),
        )

        adjustments_from_deltas = adjustments_from_deltas_with_sids
        column_view = identity
    else:
        # We use the column view to make an array per asset.
        column_view = compose(
            # We need to copy this because we need a concrete ndarray.
            # The `repeat_last_axis` call will give us a fancy strided
            # array which uses a buffer to represent `len(assets)` columns.
            # The engine puts nans at the indices for which we do not have
            # sid information so that the nan-aware reductions still work.
            # A future change to the engine would be to add first class
            # support for macro economic datasets.
            copy,
            partial(repeat_last_axis, count=len(assets)),
        )
        sparse_output = sparse_output.set_index(TS_FIELD_NAME)
        dense_output = sparse_output.reindex(dates, method='ffill')
        sparse_deltas = non_novel_deltas.set_index(TS_FIELD_NAME)
        adjustments_from_deltas = adjustments_from_deltas_no_sids

    for column_idx, column in enumerate(columns):
        column_name = column.name
        yield column, AdjustedArray(
            column_view(
                dense_output[column_name].values.astype(column.dtype),
            ),
            mask,
            adjustments_from_deltas(
                dates,
                sparse_output.index,
                column_idx,
                column_name,
                assets,
                sparse_deltas,
            ),
        )
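# -- Editor's sketch (not part of the loader) -------------------------------
# What `compose(copy, partial(repeat_last_axis, count=len(assets)))` above
# does for a macro (sid-less) column: repeat one value per date across every
# asset column. `np.broadcast_to` stands in for zipline's `repeat_last_axis`
# here (both yield a zero-copy strided view), and the explicit copy is what
# turns that view into a concrete, writable (ndates, nassets) ndarray.
import numpy as np

n_assets = 3
macro = np.array([1.1, 1.2, 1.3])  # one value per simulation date

view = np.broadcast_to(macro[:, np.newaxis], (len(macro), n_assets))
dense = view.copy()
print(dense)
print(view.strides, dense.strides)  # the view's last-axis stride is 0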
def _load_dataset(self, dates, assets, mask, columns):
    try:
        (dataset,) = set(map(getdataset, columns))
    except ValueError:
        raise AssertionError('all columns must come from the same dataset')

    expr, deltas, checkpoints, odo_kwargs = self[dataset]
    have_sids = (dataset.ndim == 2)
    asset_idx = pd.Series(index=assets, data=np.arange(len(assets)))
    assets = list(map(int, assets))  # coerce from numpy.int64
    added_query_fields = [AD_FIELD_NAME, TS_FIELD_NAME] + (
        [SID_FIELD_NAME] if have_sids else []
    )
    colnames = added_query_fields + list(map(getname, columns))

    data_query_time = self._data_query_time
    data_query_tz = self._data_query_tz
    lower_dt, upper_dt = normalize_data_query_bounds(
        dates[0],
        dates[-1],
        data_query_time,
        data_query_tz,
    )

    def collect_expr(e, lower):
        """Materialize the expression as a dataframe.

        Parameters
        ----------
        e : Expr
            The baseline or deltas expression.
        lower : datetime
            The lower time bound to query.

        Returns
        -------
        result : pd.DataFrame
            The resulting dataframe.

        Notes
        -----
        This can return more data than needed. The in memory reindex will
        handle this.
        """
        predicate = e[TS_FIELD_NAME] <= upper_dt
        if lower is not None:
            predicate &= e[TS_FIELD_NAME] >= lower

        return odo(e[predicate][colnames], pd.DataFrame, **odo_kwargs)

    lower, materialized_checkpoints = get_materialized_checkpoints(
        checkpoints, colnames, lower_dt, odo_kwargs
    )

    materialized_expr = self.pool.apply_async(collect_expr, (expr, lower))
    materialized_deltas = (
        self.pool.apply(collect_expr, (deltas, lower))
        if deltas is not None else
        pd.DataFrame(columns=colnames)
    )

    if materialized_checkpoints is not None:
        materialized_expr = pd.concat(
            (
                materialized_checkpoints,
                materialized_expr.get(),
            ),
            ignore_index=True,
            copy=False,
        )

    # It's not guaranteed that assets returned by the engine will contain
    # all sids from the deltas table; filter out such mismatches here.
    if not materialized_deltas.empty and have_sids:
        materialized_deltas = materialized_deltas[
            materialized_deltas[SID_FIELD_NAME].isin(assets)
        ]

    if data_query_time is not None:
        for m in (materialized_expr, materialized_deltas):
            m.loc[:, TS_FIELD_NAME] = m.loc[
                :, TS_FIELD_NAME
            ].astype('datetime64[ns]')

            normalize_timestamp_to_query_time(
                m,
                data_query_time,
                data_query_tz,
                inplace=True,
                ts_field=TS_FIELD_NAME,
            )

    # Inline the deltas that changed our most recently known value.
    # Also, we reindex by the dates to create a dense representation of
    # the data.
    sparse_output, non_novel_deltas = overwrite_novel_deltas(
        materialized_expr,
        materialized_deltas,
        dates,
    )
    sparse_output.drop(AD_FIELD_NAME, axis=1, inplace=True)

    sparse_deltas = last_in_date_group(non_novel_deltas,
                                       dates,
                                       assets,
                                       reindex=False,
                                       have_sids=have_sids)
    dense_output = last_in_date_group(sparse_output,
                                      dates,
                                      assets,
                                      reindex=True,
                                      have_sids=have_sids)
    ffill_across_cols(dense_output, columns, {c.name: c.name for c in columns})

    if have_sids:
        adjustments_from_deltas = adjustments_from_deltas_with_sids
        column_view = identity
    else:
        # If we do not have sids, use the column view to make a single
        # column vector which is unassociated with any assets.
        column_view = op.itemgetter(np.s_[:, np.newaxis])
        adjustments_from_deltas = adjustments_from_deltas_no_sids
        mask = np.full(
            shape=(len(mask), 1), fill_value=True, dtype=bool_dtype,
        )

    return {
        column: AdjustedArray(
            column_view(
                dense_output[column.name].values.astype(column.dtype),
            ),
            mask,
            adjustments_from_deltas(
                dates,
                sparse_output[TS_FIELD_NAME].values,
                column_idx,
                column.name,
                asset_idx,
                sparse_deltas,
            ),
            column.missing_value,
        )
        for column_idx, column in enumerate(columns)
    }
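# -- Editor's sketch (not part of the loader) -------------------------------
# The no-sid branch above keeps a single column rather than broadcasting per
# asset: `op.itemgetter(np.s_[:, np.newaxis])` reshapes a 1-D per-date series
# into an (ndates, 1) array, and the mask is rebuilt as a matching single
# column of True. Minimal sketch:
import operator as op

import numpy as np

column_view = op.itemgetter(np.s_[:, np.newaxis])

per_date = np.array([0.5, 0.6, 0.7])  # one value per simulation date
as_column = column_view(per_date)     # shape (3, 1)
mask = np.full((len(per_date), 1), True, dtype=bool)
print(as_column.shape, mask.shape)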
def _load_dataset(self, dates, assets, mask, columns):
    try:
        (dataset,) = set(map(getdataset, columns))
    except ValueError:
        raise AssertionError('all columns must come from the same dataset')

    expr, deltas, odo_kwargs = self[dataset]
    have_sids = SID_FIELD_NAME in expr.fields
    asset_idx = pd.Series(index=assets, data=np.arange(len(assets)))
    assets = list(map(int, assets))  # coerce from numpy.int64
    added_query_fields = [AD_FIELD_NAME, TS_FIELD_NAME] + (
        [SID_FIELD_NAME] if have_sids else []
    )

    data_query_time = self._data_query_time
    data_query_tz = self._data_query_tz
    lower_dt, upper_dt = normalize_data_query_bounds(
        dates[0],
        dates[-1],
        data_query_time,
        data_query_tz,
    )

    def where(e):
        """Create the query to run against the resources.

        Parameters
        ----------
        e : Expr
            The baseline or deltas expression.

        Returns
        -------
        q : Expr
            The query to run.
        """
        def lower_for_col(column):
            pred = e[TS_FIELD_NAME] <= lower_dt
            colname = column.name
            schema = e[colname].schema.measure
            if isinstance(schema, Option):
                pred &= e[colname].notnull()
                schema = schema.ty
            if schema in floating:
                pred &= ~e[colname].isnan()

            filtered = e[pred]
            lower = filtered[TS_FIELD_NAME].max()
            if have_sids:
                # If we have sids, then we need to take the earliest of the
                # greatest date that has a non-null value by sid.
                lower = bz.by(
                    filtered[SID_FIELD_NAME],
                    timestamp=lower,
                ).timestamp.min()

            return lower

        lower = odo(
            reduce(
                bz.least,
                map(lower_for_col, columns),
            ),
            pd.Timestamp,
            **odo_kwargs
        )
        if lower is pd.NaT:
            lower = lower_dt

        return e[
            (e[TS_FIELD_NAME] >= lower) & (e[TS_FIELD_NAME] <= upper_dt)
        ][added_query_fields + list(map(getname, columns))]

    def collect_expr(e):
        """Execute and merge all of the per-column subqueries.

        Parameters
        ----------
        e : Expr
            The baseline or deltas expression.

        Returns
        -------
        result : pd.DataFrame
            The resulting dataframe.

        Notes
        -----
        This can return more data than needed. The in memory reindex will
        handle this.
        """
        df = odo(where(e), pd.DataFrame, **odo_kwargs)
        df.sort(TS_FIELD_NAME, inplace=True)  # sort for the groupby later
        return df

    materialized_expr = collect_expr(expr)
    materialized_deltas = (
        collect_expr(deltas)
        if deltas is not None else
        pd.DataFrame(
            columns=added_query_fields + list(map(getname, columns)),
        )
    )

    # It's not guaranteed that assets returned by the engine will contain
    # all sids from the deltas table; filter out such mismatches here.
    if not materialized_deltas.empty and have_sids:
        materialized_deltas = materialized_deltas[
            materialized_deltas[SID_FIELD_NAME].isin(assets)
        ]

    if data_query_time is not None:
        for m in (materialized_expr, materialized_deltas):
            m.loc[:, TS_FIELD_NAME] = m.loc[
                :, TS_FIELD_NAME
            ].astype('datetime64[ns]')

            normalize_timestamp_to_query_time(
                m,
                data_query_time,
                data_query_tz,
                inplace=True,
                ts_field=TS_FIELD_NAME,
            )

    # Inline the deltas that changed our most recently known value.
    # Also, we reindex by the dates to create a dense representation of
    # the data.
    sparse_output, non_novel_deltas = overwrite_novel_deltas(
        materialized_expr,
        materialized_deltas,
        dates,
    )
    sparse_output.drop(AD_FIELD_NAME, axis=1, inplace=True)

    def last_in_date_group(df, reindex, have_sids=have_sids):
        idx = dates[dates.searchsorted(
            df[TS_FIELD_NAME].values.astype('datetime64[D]')
        )]
        if have_sids:
            idx = [idx, SID_FIELD_NAME]

        last_in_group = df.drop(TS_FIELD_NAME, axis=1).groupby(
            idx,
            sort=False,
        ).last()

        if have_sids:
            last_in_group = last_in_group.unstack()

        if reindex:
            if have_sids:
                cols = last_in_group.columns
                last_in_group = last_in_group.reindex(
                    index=dates,
                    columns=pd.MultiIndex.from_product(
                        (cols.levels[0], assets),
                        names=cols.names,
                    ),
                )
            else:
                last_in_group = last_in_group.reindex(dates)

        return last_in_group

    sparse_deltas = last_in_date_group(non_novel_deltas, reindex=False)
    dense_output = last_in_date_group(sparse_output, reindex=True)
    dense_output.ffill(inplace=True)

    # Fill in missing values specified by each column. This is made
    # significantly more complex by the fact that we need to work around
    # two pandas issues:
    # 1) When we have sids, if there are no records for a given sid for any
    #    dates, pandas will generate a column full of NaNs for that sid.
    #    This means that some of the columns in `dense_output` are now
    #    float instead of the intended dtype, so we have to coerce back to
    #    our expected type and convert NaNs into the desired missing value.
    # 2) DataFrame.fillna assumes that receiving None as a fill-value means
    #    that no value was passed. Consequently, there's no way to tell
    #    pandas to replace NaNs in an object column with None using fillna,
    #    so we have to roll our own instead using df.where.
    for column in columns:
        # Special logic for strings since `fillna` doesn't work if the
        # missing value is `None`.
        if column.dtype == categorical_dtype:
            dense_output[column.name] = dense_output[
                column.name
            ].where(pd.notnull(dense_output[column.name]),
                    column.missing_value)
        else:
            # We need to execute `fillna` before `astype` in case the
            # column contains NaNs and needs to be cast to bool or int.
            # This is so that the NaNs are replaced first, since pandas
            # can't convert NaNs for those types.
            dense_output[column.name] = dense_output[
                column.name
            ].fillna(column.missing_value).astype(column.dtype)

    if have_sids:
        adjustments_from_deltas = adjustments_from_deltas_with_sids
        column_view = identity
    else:
        # We use the column view to make an array per asset.
        column_view = compose(
            # We need to copy this because we need a concrete ndarray.
            # The `repeat_last_axis` call will give us a fancy strided
            # array which uses a buffer to represent `len(assets)` columns.
            # The engine puts nans at the indices for which we do not have
            # sid information so that the nan-aware reductions still work.
            # A future change to the engine would be to add first class
            # support for macro economic datasets.
            copy,
            partial(repeat_last_axis, count=len(assets)),
        )
        adjustments_from_deltas = adjustments_from_deltas_no_sids

    for column_idx, column in enumerate(columns):
        column_name = column.name
        yield column, AdjustedArray(
            column_view(
                dense_output[column_name].values.astype(column.dtype),
            ),
            mask,
            adjustments_from_deltas(
                dates,
                sparse_output[TS_FIELD_NAME].values,
                column_idx,
                column_name,
                asset_idx,
                sparse_deltas,
            ),
            column.missing_value,
        )