def get_last_data_per_qtr(self, assets_with_data, columns, dates):
    """
    Determine the last piece of information we know for each column on each
    date in the index for each sid and quarter.

    Parameters
    ----------
    assets_with_data : pd.Index
        Index of all assets that appear in the raw data given to the loader.
    columns : iterable of BoundColumn
        The columns that need to be loaded from the raw data.
    dates : pd.DatetimeIndex
        The calendar of dates for which data should be loaded.

    Returns
    -------
    stacked_last_per_qtr : pd.DataFrame
        A DataFrame indexed by [dates, sid, normalized_quarters] that has
        the latest information for each row of the index, sorted by event
        date.
    last_per_qtr : pd.DataFrame
        A DataFrame with columns that are a MultiIndex of
        [self.estimates.columns, normalized_quarters, sid].
    """
    # Get a DataFrame indexed by date with a MultiIndex of columns of
    # [self.estimates.columns, normalized_quarters, sid], where each cell
    # contains the latest data for that day.
    last_per_qtr = last_in_date_group(
        self.estimates,
        dates,
        assets_with_data,
        reindex=True,
        extra_groupers=[NORMALIZED_QUARTERS],
    )
    # Forward fill values for each quarter/sid/dataset column.
    ffill_across_cols(last_per_qtr, columns, self.name_map)
    # Stack quarter and sid into the index.
    stacked_last_per_qtr = last_per_qtr.stack(
        [SID_FIELD_NAME, NORMALIZED_QUARTERS],
    )
    # Set the date index name for ease of reference.
    stacked_last_per_qtr.index.set_names(
        SIMULATION_DATES,
        level=0,
        inplace=True,
    )
    stacked_last_per_qtr = stacked_last_per_qtr.sort_values(
        EVENT_DATE_FIELD_NAME,
    )
    stacked_last_per_qtr[EVENT_DATE_FIELD_NAME] = pd.to_datetime(
        stacked_last_per_qtr[EVENT_DATE_FIELD_NAME]
    )
    return last_per_qtr, stacked_last_per_qtr
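# Illustrative sketch (not part of the loader): last_in_date_group and
# ffill_across_cols are assumed to reduce the raw estimates to the latest
# record per (date, quarter, sid) and then forward fill along dates. The toy
# frame, column names, and values below are hypothetical, and plain pandas
# stands in for the real helpers to show the shape of the intermediate result.
import pandas as pd

raw = pd.DataFrame({
    'timestamp': pd.to_datetime(['2014-01-02', '2014-01-02', '2014-01-03']),
    'sid': [1, 1, 1],
    'normalized_quarters': [8056, 8056, 8056],
    'estimate': [1.10, 1.15, 1.20],
})
dates = pd.date_range('2014-01-02', '2014-01-06')

# Latest row per (date, quarter, sid), reindexed to the full calendar --
# roughly what last_in_date_group(..., reindex=True) produces.
last_per_qtr = (
    raw.groupby(['timestamp', 'normalized_quarters', 'sid'])
       .last()
       .unstack(['normalized_quarters', 'sid'])
       .reindex(dates)
)
# Forward fill along the date axis -- roughly what ffill_across_cols does,
# ignoring per-column missing values.
last_per_qtr = last_per_qtr.ffill()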
def _load_dataset(self, dates, assets, mask, columns):
    try:
        (dataset,) = set(map(getdataset, columns))
    except ValueError:
        raise AssertionError('all columns must come from the same dataset')

    expr, deltas, checkpoints, odo_kwargs, apply_deltas_adjustments = self[
        dataset
    ]
    have_sids = (dataset.ndim == 2)
    asset_idx = pd.Series(index=assets, data=np.arange(len(assets)))
    assets = list(map(int, assets))  # coerce from numpy.int64
    added_query_fields = {AD_FIELD_NAME, TS_FIELD_NAME} | (
        {SID_FIELD_NAME} if have_sids else set()
    )
    requested_columns = set(map(getname, columns))
    colnames = sorted(added_query_fields | requested_columns)

    data_query_time = self._data_query_time
    data_query_tz = self._data_query_tz
    lower_dt, upper_dt = normalize_data_query_bounds(
        dates[0],
        dates[-1],
        data_query_time,
        data_query_tz,
    )

    def collect_expr(e, lower):
        """Materialize the expression as a dataframe.

        Parameters
        ----------
        e : Expr
            The baseline or deltas expression.
        lower : datetime
            The lower time bound to query.

        Returns
        -------
        result : pd.DataFrame
            The resulting dataframe.

        Notes
        -----
        This can return more data than needed. The in memory reindex will
        handle this.
        """
        predicate = e[TS_FIELD_NAME] <= upper_dt
        if lower is not None:
            predicate &= e[TS_FIELD_NAME] >= lower

        return odo(e[predicate][colnames], pd.DataFrame, **odo_kwargs)

    lower, materialized_checkpoints = get_materialized_checkpoints(
        checkpoints, colnames, lower_dt, odo_kwargs,
    )

    materialized_expr = self.pool.apply_async(collect_expr, (expr, lower))
    materialized_deltas = (
        self.pool.apply(collect_expr, (deltas, lower))
        if deltas is not None else
        pd.DataFrame(columns=colnames)
    )

    if materialized_checkpoints is not None:
        materialized_expr = pd.concat(
            (
                materialized_checkpoints,
                materialized_expr.get(),
            ),
            ignore_index=True,
            copy=False,
        )

    # It's not guaranteed that assets returned by the engine will contain
    # all sids from the deltas table; filter out such mismatches here.
    if not materialized_deltas.empty and have_sids:
        materialized_deltas = materialized_deltas[
            materialized_deltas[SID_FIELD_NAME].isin(assets)
        ]

    if data_query_time is not None:
        for m in (materialized_expr, materialized_deltas):
            m.loc[:, TS_FIELD_NAME] = m.loc[:, TS_FIELD_NAME].astype(
                'datetime64[ns]',
            )
            normalize_timestamp_to_query_time(
                m,
                data_query_time,
                data_query_tz,
                inplace=True,
                ts_field=TS_FIELD_NAME,
            )

    # Inline the deltas that changed our most recently known value.
    # Also, we reindex by the dates to create a dense representation of
    # the data.
    sparse_output, non_novel_deltas = overwrite_novel_deltas(
        materialized_expr,
        materialized_deltas,
        dates,
    )

    if AD_FIELD_NAME not in requested_columns:
        sparse_output.drop(AD_FIELD_NAME, axis=1, inplace=True)

    sparse_deltas = last_in_date_group(
        non_novel_deltas,
        dates,
        assets,
        reindex=False,
        have_sids=have_sids,
    )
    dense_output = last_in_date_group(
        sparse_output,
        dates,
        assets,
        reindex=True,
        have_sids=have_sids,
    )
    ffill_across_cols(
        dense_output,
        columns,
        {c.name: c.name for c in columns},
    )

    # By default, no non-novel deltas are applied.
    def no_adjustments_from_deltas(*args):
        return {}

    adjustments_from_deltas = no_adjustments_from_deltas
    if have_sids:
        if apply_deltas_adjustments:
            adjustments_from_deltas = adjustments_from_deltas_with_sids
        column_view = identity
    else:
        # If we do not have sids, use the column view to make a single
        # column vector which is unassociated with any assets.
        column_view = op.itemgetter(np.s_[:, np.newaxis])

        if apply_deltas_adjustments:
            adjustments_from_deltas = adjustments_from_deltas_no_sids
        mask = np.full(
            shape=(len(mask), 1),
            fill_value=True,
            dtype=bool_dtype,
        )

    return {
        column: AdjustedArray(
            column_view(
                dense_output[column.name].values.astype(column.dtype),
            ),
            mask,
            adjustments_from_deltas(
                dates,
                sparse_output[TS_FIELD_NAME].values,
                column_idx,
                column.name,
                asset_idx,
                sparse_deltas,
            ),
            column.missing_value,
        )
        for column_idx, column in enumerate(columns)
    }
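# Illustrative sketch (not part of the loader): for a dataset without sids,
# column_view = op.itemgetter(np.s_[:, np.newaxis]) reshapes each 1-D
# per-date column into an (n_dates, 1) array so it lines up with the
# synthetic single-column mask built above. The sample array is hypothetical.
import operator as op

import numpy as np

column_view = op.itemgetter(np.s_[:, np.newaxis])

values = np.array([1.0, 2.0, 3.0])   # one value per date, no sids
as_column = column_view(values)      # equivalent to values[:, np.newaxis]
mask = np.full(shape=(len(values), 1), fill_value=True, dtype=bool)

assert as_column.shape == mask.shape == (3, 1)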
def _load_dataset(self, dates, assets, mask, columns):
    try:
        (dataset,) = set(map(getdataset, columns))
    except ValueError:
        raise AssertionError('all columns must come from the same dataset')

    expr, deltas, checkpoints, odo_kwargs = self[dataset]
    have_sids = (dataset.ndim == 2)
    asset_idx = pd.Series(index=assets, data=np.arange(len(assets)))
    assets = list(map(int, assets))  # coerce from numpy.int64
    added_query_fields = [AD_FIELD_NAME, TS_FIELD_NAME] + (
        [SID_FIELD_NAME] if have_sids else []
    )
    colnames = added_query_fields + list(map(getname, columns))

    data_query_time = self._data_query_time
    data_query_tz = self._data_query_tz
    lower_dt, upper_dt = normalize_data_query_bounds(
        dates[0],
        dates[-1],
        data_query_time,
        data_query_tz,
    )

    def collect_expr(e, lower):
        """Materialize the expression as a dataframe.

        Parameters
        ----------
        e : Expr
            The baseline or deltas expression.
        lower : datetime
            The lower time bound to query.

        Returns
        -------
        result : pd.DataFrame
            The resulting dataframe.

        Notes
        -----
        This can return more data than needed. The in memory reindex will
        handle this.
        """
        predicate = e[TS_FIELD_NAME] <= upper_dt
        if lower is not None:
            predicate &= e[TS_FIELD_NAME] >= lower

        return odo(e[predicate][colnames], pd.DataFrame, **odo_kwargs)

    lower, materialized_checkpoints = get_materialized_checkpoints(
        checkpoints, colnames, lower_dt, odo_kwargs,
    )

    materialized_expr = self.pool.apply_async(collect_expr, (expr, lower))
    materialized_deltas = (
        self.pool.apply(collect_expr, (deltas, lower))
        if deltas is not None else
        pd.DataFrame(columns=colnames)
    )

    if materialized_checkpoints is not None:
        materialized_expr = pd.concat(
            (
                materialized_checkpoints,
                materialized_expr.get(),
            ),
            ignore_index=True,
            copy=False,
        )

    # It's not guaranteed that assets returned by the engine will contain
    # all sids from the deltas table; filter out such mismatches here.
    if not materialized_deltas.empty and have_sids:
        materialized_deltas = materialized_deltas[
            materialized_deltas[SID_FIELD_NAME].isin(assets)
        ]

    if data_query_time is not None:
        for m in (materialized_expr, materialized_deltas):
            m.loc[:, TS_FIELD_NAME] = m.loc[:, TS_FIELD_NAME].astype(
                'datetime64[ns]',
            )
            normalize_timestamp_to_query_time(
                m,
                data_query_time,
                data_query_tz,
                inplace=True,
                ts_field=TS_FIELD_NAME,
            )

    # Inline the deltas that changed our most recently known value.
    # Also, we reindex by the dates to create a dense representation of
    # the data.
    sparse_output, non_novel_deltas = overwrite_novel_deltas(
        materialized_expr,
        materialized_deltas,
        dates,
    )
    sparse_output.drop(AD_FIELD_NAME, axis=1, inplace=True)

    sparse_deltas = last_in_date_group(
        non_novel_deltas,
        dates,
        assets,
        reindex=False,
        have_sids=have_sids,
    )
    dense_output = last_in_date_group(
        sparse_output,
        dates,
        assets,
        reindex=True,
        have_sids=have_sids,
    )
    ffill_across_cols(
        dense_output,
        columns,
        {c.name: c.name for c in columns},
    )

    if have_sids:
        adjustments_from_deltas = adjustments_from_deltas_with_sids
        column_view = identity
    else:
        # If we do not have sids, use the column view to make a single
        # column vector which is unassociated with any assets.
        column_view = op.itemgetter(np.s_[:, np.newaxis])
        adjustments_from_deltas = adjustments_from_deltas_no_sids
        mask = np.full(
            shape=(len(mask), 1),
            fill_value=True,
            dtype=bool_dtype,
        )

    return {
        column: AdjustedArray(
            column_view(
                dense_output[column.name].values.astype(column.dtype),
            ),
            mask,
            adjustments_from_deltas(
                dates,
                sparse_output[TS_FIELD_NAME].values,
                column_idx,
                column.name,
                asset_idx,
                sparse_deltas,
            ),
            column.missing_value,
        )
        for column_idx, column in enumerate(columns)
    }
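# Conceptual sketch (not the real AdjustedArray API): the
# adjustments_from_deltas_* helpers are assumed to translate the non-novel
# deltas into a mapping of {row index at which the delta was learned:
# [overwrites of earlier rows]} that is replayed when a trailing window is
# requested. The numbers and the plain-tuple "overwrite" representation
# below are hypothetical stand-ins for the real adjustment objects.
import numpy as np

dense = np.array([[1.10], [1.10], [1.10]])      # baseline: one column, 3 dates
# On the third date we learn a restated value for the first two dates.
adjustments = {2: [('overwrite', 0, 1, 1.25)]}  # (kind, first_row, last_row, value)

def window_as_of(row):
    """Return the column as it would have been known on `row`'s date."""
    out = dense.copy()
    for learned_at, overwrites in adjustments.items():
        if learned_at <= row:
            for _, first, last, value in overwrites:
                out[first:last + 1] = value
    return out[:row + 1]

assert window_as_of(1).ravel().tolist() == [1.10, 1.10]
assert window_as_of(2).ravel().tolist() == [1.25, 1.25, 1.10]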