Example #1
    def test_novel_deltas_macro(self):
        asset_info = asset_infos[0][0]
        base_dates = pd.DatetimeIndex([
            pd.Timestamp('2014-01-01'),
            pd.Timestamp('2014-01-04')
        ])
        baseline = pd.DataFrame({
            'value': (0, 1),
            'asof_date': base_dates,
            'timestamp': base_dates,
        })
        expr = bz.Data(baseline, name='expr', dshape=self.macro_dshape)
        deltas = bz.Data(baseline, name='deltas', dshape=self.macro_dshape)
        deltas = bz.transform(
            deltas,
            value=deltas.value + 10,
            timestamp=deltas.timestamp + timedelta(days=1),
        )

        nassets = len(asset_info)
        expected_views = keymap(pd.Timestamp, {
            '2014-01-03': repeat_last_axis(
                np.array([10.0, 10.0, 10.0]),
                nassets,
            ),
            '2014-01-06': repeat_last_axis(
                np.array([10.0, 10.0, 11.0]),
                nassets,
            ),
        })

        cal = pd.DatetimeIndex([
            pd.Timestamp('2014-01-01'),
            pd.Timestamp('2014-01-02'),
            pd.Timestamp('2014-01-03'),
            # omitting the 4th and 5th to simulate a weekend
            pd.Timestamp('2014-01-06'),
        ])
        with tmp_asset_finder(equities=asset_info) as finder:
            expected_output = pd.DataFrame(
                list(concatv([10] * nassets, [11] * nassets)),
                index=pd.MultiIndex.from_product((
                    sorted(expected_views.keys()),
                    finder.retrieve_all(asset_info.index),
                )),
                columns=('value',),
            )
            self._run_pipeline(
                expr,
                deltas,
                expected_views,
                expected_output,
                finder,
                calendar=cal,
                start=cal[2],
                end=cal[-1],
                window_length=3,
                compute_fn=op.itemgetter(-1),
            )
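
The macro test builds its expected views out of a few helpers. A minimal sketch of what they do, assuming toolz's keymap/concatv and a stride-trick equivalent of zipline's repeat_last_axis:

    import numpy as np
    import pandas as pd
    from numpy.lib.stride_tricks import as_strided
    from toolz import concatv, keymap

    # keymap applies a function to every key; here it turns the date
    # strings keying expected_views into pd.Timestamp objects.
    views = keymap(pd.Timestamp, {'2014-01-03': 0})
    assert pd.Timestamp('2014-01-03') in views

    # repeat_last_axis-style broadcast: repeat a 1-d window across
    # nassets columns without copying, via a zero stride on the new axis.
    window = np.array([10.0, 10.0, 10.0])
    nassets = 3
    view = as_strided(window, window.shape + (nassets,), window.strides + (0,))
    assert view.shape == (3, 3) and (view[:, 0] == window).all()

    # concatv lazily chains iterables, as in the expected_output buffer.
    assert list(concatv([10] * nassets, [11] * nassets)) == [10, 10, 10, 11, 11, 11]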
Example #2
    def test_novel_deltas(self, asset_info):
        base_dates = pd.DatetimeIndex([pd.Timestamp("2014-01-01"), pd.Timestamp("2014-01-04")])
        repeated_dates = base_dates.repeat(3)
        baseline = pd.DataFrame(
            {
                "sid": self.sids * 2,
                "value": (0, 1, 2, 1, 2, 3),
                "asof_date": repeated_dates,
                "timestamp": repeated_dates,
            }
        )
        expr = bz.Data(baseline, name="expr", dshape=self.dshape)
        deltas = bz.Data(baseline, name="deltas", dshape=self.dshape)
        deltas = bz.transform(deltas, value=deltas.value + 10, timestamp=deltas.timestamp + timedelta(days=1))
        expected_views = keymap(
            pd.Timestamp,
            {
                "2014-01-03": np.array([[10.0, 11.0, 12.0], [10.0, 11.0, 12.0], [10.0, 11.0, 12.0]]),
                "2014-01-06": np.array([[10.0, 11.0, 12.0], [10.0, 11.0, 12.0], [11.0, 12.0, 13.0]]),
            },
        )
        if len(asset_info) == 4:
            expected_views = valmap(lambda view: np.c_[view, [np.nan, np.nan, np.nan]], expected_views)
            expected_output_buffer = [10, 11, 12, np.nan, 11, 12, 13, np.nan]
        else:
            expected_output_buffer = [10, 11, 12, 11, 12, 13]

        cal = pd.DatetimeIndex(
            [
                pd.Timestamp("2014-01-01"),
                pd.Timestamp("2014-01-02"),
                pd.Timestamp("2014-01-03"),
                # omitting the 4th and 5th to simulate a weekend
                pd.Timestamp("2014-01-06"),
            ]
        )

        with tmp_asset_finder(equities=asset_info) as finder:
            expected_output = pd.DataFrame(
                expected_output_buffer,
                index=pd.MultiIndex.from_product(
                    (sorted(expected_views.keys()), finder.retrieve_all(asset_info.index))
                ),
                columns=("value",),
            )
            self._run_pipeline(
                expr,
                deltas,
                expected_views,
                expected_output,
                finder,
                calendar=cal,
                start=cal[2],
                end=cal[-1],
                window_length=3,
                compute_fn=op.itemgetter(-1),
            )
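
When asset_info carries a fourth sid that never appears in the baseline, each expected view gains a NaN column. The valmap/np.c_ padding from the test, shown in isolation:

    import numpy as np
    from toolz import valmap

    # valmap rewrites every dict value; np.c_ appends the NaN column
    # for the sid that has no data.
    views = {'2014-01-03': np.full((3, 3), 10.0)}
    padded = valmap(lambda view: np.c_[view, [np.nan, np.nan, np.nan]], views)
    assert padded['2014-01-03'].shape == (3, 4)
    assert np.isnan(padded['2014-01-03'][:, -1]).all()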
Example #3
    def _load_dataset(self, dates, assets, mask, columns):
        try:
            (dataset,) = set(map(getdataset, columns))
        except ValueError:
            raise AssertionError('all columns must come from the same dataset')

        expr, deltas, checkpoints, odo_kwargs = self[dataset]
        have_sids = (dataset.ndim == 2)
        asset_idx = pd.Series(index=assets, data=np.arange(len(assets)))
        assets = list(map(int, assets))  # coerce from numpy.int64
        added_query_fields = [AD_FIELD_NAME, TS_FIELD_NAME] + (
            [SID_FIELD_NAME] if have_sids else []
        )
        colnames = added_query_fields + list(map(getname, columns))

        data_query_time = self._data_query_time
        data_query_tz = self._data_query_tz
        lower_dt, upper_dt = normalize_data_query_bounds(
            dates[0],
            dates[-1],
            data_query_time,
            data_query_tz,
        )

        def collect_expr(e, lower):
            """Materialize the expression as a dataframe.

            Parameters
            ----------
            e : Expr
                The baseline or deltas expression.
            lower : datetime
                The lower time bound to query.

            Returns
            -------
            result : pd.DataFrame
                The resulting dataframe.

            Notes
            -----
            This can return more data than needed. The in-memory reindex will
            handle this.
            """
            predicate = e[TS_FIELD_NAME] <= upper_dt
            if lower is not None:
                predicate &= e[TS_FIELD_NAME] >= lower

            return odo(e[predicate][colnames], pd.DataFrame, **odo_kwargs)

        if checkpoints is not None:
            ts = checkpoints[TS_FIELD_NAME]
            checkpoints_ts = odo(ts[ts <= lower_dt].max(), pd.Timestamp)
            if pd.isnull(checkpoints_ts):
                materialized_checkpoints = pd.DataFrame(columns=colnames)
                lower = None
            else:
                materialized_checkpoints = odo(
                    checkpoints[ts == checkpoints_ts][colnames],
                    pd.DataFrame,
                    **odo_kwargs
                )
                lower = checkpoints_ts
        else:
            materialized_checkpoints = pd.DataFrame(columns=colnames)
            lower = None

        materialized_expr = collect_expr(expr, lower)
        if materialized_checkpoints is not None:
            materialized_expr = pd.concat(
                (
                    materialized_checkpoints,
                    materialized_expr,
                ),
                ignore_index=True,
                copy=False,
            )
        materialized_deltas = (
            collect_expr(deltas, lower)
            if deltas is not None else
            pd.DataFrame(columns=colnames)
        )

        # It's not guaranteed that assets returned by the engine will contain
        # all sids from the deltas table; filter out such mismatches here.
        if not materialized_deltas.empty and have_sids:
            materialized_deltas = materialized_deltas[
                materialized_deltas[SID_FIELD_NAME].isin(assets)
            ]

        if data_query_time is not None:
            for m in (materialized_expr, materialized_deltas):
                m.loc[:, TS_FIELD_NAME] = m.loc[
                    :, TS_FIELD_NAME
                ].astype('datetime64[ns]')

                normalize_timestamp_to_query_time(
                    m,
                    data_query_time,
                    data_query_tz,
                    inplace=True,
                    ts_field=TS_FIELD_NAME,
                )

        # Inline the deltas that changed our most recently known value.
        # Also, we reindex by the dates to create a dense representation of
        # the data.
        sparse_output, non_novel_deltas = overwrite_novel_deltas(
            materialized_expr,
            materialized_deltas,
            dates,
        )
        sparse_output.drop(AD_FIELD_NAME, axis=1, inplace=True)

        def last_in_date_group(df, reindex, have_sids=have_sids):
            idx = dates[dates.searchsorted(
                df[TS_FIELD_NAME].values.astype('datetime64[D]')
            )]
            if have_sids:
                idx = [idx, SID_FIELD_NAME]

            last_in_group = df.drop(TS_FIELD_NAME, axis=1).groupby(
                idx,
                sort=False,
            ).last()

            if have_sids:
                last_in_group = last_in_group.unstack()

            if reindex:
                if have_sids:
                    cols = last_in_group.columns
                    last_in_group = last_in_group.reindex(
                        index=dates,
                        columns=pd.MultiIndex.from_product(
                            (cols.levels[0], assets),
                            names=cols.names,
                        ),
                    )
                else:
                    last_in_group = last_in_group.reindex(dates)

            return last_in_group

        sparse_deltas = last_in_date_group(non_novel_deltas, reindex=False)
        dense_output = last_in_date_group(sparse_output, reindex=True)
        dense_output.ffill(inplace=True)

        # Fill in missing values specified by each column. This is made
        # significantly more complex by the fact that we need to work around
        # two pandas issues:

        # 1) When we have sids, if there are no records for a given sid for any
        #    dates, pandas will generate a column full of NaNs for that sid.
        #    This means that some of the columns in `dense_output` are now
        #    float instead of the intended dtype, so we have to coerce back to
        #    our expected type and convert NaNs into the desired missing value.

        # 2) DataFrame.fillna assumes that receiving None as a fill-value means
        #    that no value was passed.  Consequently, there's no way to tell
        #    pandas to replace NaNs in an object column with None using fillna,
        #    so we have to roll our own instead using df.where.
        for column in columns:
            # Special logic for strings since `fillna` doesn't work if the
            # missing value is `None`.
            if column.dtype == categorical_dtype:
                dense_output[column.name] = dense_output[
                    column.name
                ].where(pd.notnull(dense_output[column.name]),
                        column.missing_value)
            else:
                # We need to execute `fillna` before `astype` in case the
                # column contains NaNs and needs to be cast to bool or int.
                # This is so that the NaNs are replaced first, since pandas
                # can't convert NaNs for those types.
                dense_output[column.name] = dense_output[
                    column.name
                ].fillna(column.missing_value).astype(column.dtype)

        if have_sids:
            adjustments_from_deltas = adjustments_from_deltas_with_sids
            column_view = identity
        else:
            # If we do not have sids, use the column view to make a single
            # column vector which is unassociated with any assets.
            column_view = op.itemgetter(np.s_[:, np.newaxis])

            adjustments_from_deltas = adjustments_from_deltas_no_sids
            mask = np.full(
                shape=(len(mask), 1), fill_value=True, dtype=bool_dtype,
            )

        for column_idx, column in enumerate(columns):
            column_name = column.name
            yield column, AdjustedArray(
                column_view(
                    dense_output[column_name].values.astype(column.dtype),
                ),
                mask,
                adjustments_from_deltas(
                    dates,
                    sparse_output[TS_FIELD_NAME].values,
                    column_idx,
                    column_name,
                    asset_idx,
                    sparse_deltas,
                ),
                column.missing_value,
            )
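
The two pandas workarounds described in the fill loop are easier to see in isolation. A minimal sketch with made-up column names:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({
        'label': pd.Series(['a', np.nan], dtype=object),
        'count': pd.Series([1.0, np.nan]),
    })

    # fillna(None) reads as "no fill value was passed", so replacing NaN
    # with None in an object column has to go through where() instead.
    df['label'] = df['label'].where(pd.notnull(df['label']), None)
    assert df['label'][1] is None

    # NaN cannot be cast to bool or int, so fill first, then cast.
    df['count'] = df['count'].fillna(-1).astype('int64')
    assert df['count'].tolist() == [1, -1]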
Example #4
    def _load_dataset(self, dates, assets, mask, columns):
        try:
            (dataset,) = set(map(getdataset, columns))
        except ValueError:
            raise AssertionError('all columns must come from the same dataset')

        expr, deltas, checkpoints, odo_kwargs, apply_deltas_adjustments = self[
            dataset]
        have_sids = (dataset.ndim == 2)
        asset_idx = pd.Series(index=assets, data=np.arange(len(assets)))
        assets = list(map(int, assets))  # coerce from numpy.int64
        added_query_fields = {AD_FIELD_NAME, TS_FIELD_NAME} | (
            {SID_FIELD_NAME} if have_sids else set()
        )
        requested_columns = set(map(getname, columns))
        colnames = sorted(added_query_fields | requested_columns)

        data_query_time = self._data_query_time
        data_query_tz = self._data_query_tz
        lower_dt, upper_dt = normalize_data_query_bounds(
            dates[0],
            dates[-1],
            data_query_time,
            data_query_tz,
        )

        def collect_expr(e, lower):
            """Materialize the expression as a dataframe.

            Parameters
            ----------
            e : Expr
                The baseline or deltas expression.
            lower : datetime
                The lower time bound to query.

            Returns
            -------
            result : pd.DataFrame
                The resulting dataframe.

            Notes
            -----
            This can return more data than needed. The in-memory reindex will
            handle this.
            """
            predicate = e[TS_FIELD_NAME] <= upper_dt
            if lower is not None:
                predicate &= e[TS_FIELD_NAME] >= lower

            return odo(e[predicate][colnames], pd.DataFrame, **odo_kwargs)

        lower, materialized_checkpoints = get_materialized_checkpoints(
            checkpoints, colnames, lower_dt, odo_kwargs)

        materialized_expr = self.pool.apply_async(collect_expr, (expr, lower))
        materialized_deltas = (
            self.pool.apply(collect_expr, (deltas, lower))
            if deltas is not None else
            pd.DataFrame(columns=colnames)
        )

        if materialized_checkpoints is not None:
            materialized_expr = pd.concat(
                (
                    materialized_checkpoints,
                    materialized_expr.get(),
                ),
                ignore_index=True,
                copy=False,
            )

        # It's not guaranteed that assets returned by the engine will contain
        # all sids from the deltas table; filter out such mismatches here.
        if not materialized_deltas.empty and have_sids:
            materialized_deltas = materialized_deltas[
                materialized_deltas[SID_FIELD_NAME].isin(assets)]

        if data_query_time is not None:
            for m in (materialized_expr, materialized_deltas):
                m.loc[:, TS_FIELD_NAME] = m.loc[:, TS_FIELD_NAME].astype(
                    'datetime64[ns]')

                normalize_timestamp_to_query_time(
                    m,
                    data_query_time,
                    data_query_tz,
                    inplace=True,
                    ts_field=TS_FIELD_NAME,
                )

        # Inline the deltas that changed our most recently known value.
        # Also, we reindex by the dates to create a dense representation of
        # the data.
        sparse_output, non_novel_deltas = overwrite_novel_deltas(
            materialized_expr,
            materialized_deltas,
            dates,
        )
        if AD_FIELD_NAME not in requested_columns:
            sparse_output.drop(AD_FIELD_NAME, axis=1, inplace=True)

        sparse_deltas = last_in_date_group(non_novel_deltas,
                                           dates,
                                           assets,
                                           reindex=False,
                                           have_sids=have_sids)
        dense_output = last_in_date_group(sparse_output,
                                          dates,
                                          assets,
                                          reindex=True,
                                          have_sids=have_sids)
        ffill_across_cols(
            dense_output, columns, {c.name: c.name for c in columns}
        )

        # By default, no non-novel deltas are applied.
        def no_adjustments_from_deltas(*args):
            return {}

        adjustments_from_deltas = no_adjustments_from_deltas
        if have_sids:
            if apply_deltas_adjustments:
                adjustments_from_deltas = adjustments_from_deltas_with_sids
            column_view = identity
        else:
            # If we do not have sids, use the column view to make a single
            # column vector which is unassociated with any assets.
            column_view = op.itemgetter(np.s_[:, np.newaxis])
            if apply_deltas_adjustments:
                adjustments_from_deltas = adjustments_from_deltas_no_sids
            mask = np.full(
                shape=(len(mask), 1),
                fill_value=True,
                dtype=bool_dtype,
            )

        return {
            column: AdjustedArray(
                column_view(
                    dense_output[column.name].values.astype(column.dtype),
                ),
                mask,
                adjustments_from_deltas(
                    dates,
                    sparse_output[TS_FIELD_NAME].values,
                    column_idx,
                    column.name,
                    asset_idx,
                    sparse_deltas,
                ),
                column.missing_value,
            )
            for column_idx, column in enumerate(columns)
        }
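
This version overlaps the two queries: apply_async starts materializing the baseline while apply runs the deltas query, and .get() later joins on the baseline result. The pattern sketched with a ThreadPool standing in for self.pool and a toy workload:

    from multiprocessing.pool import ThreadPool

    pool = ThreadPool(2)

    def collect(name, lower):
        # stand-in for collect_expr's bounded query
        return '%s where ts >= %s' % (name, lower)

    baseline = pool.apply_async(collect, ('expr', '2014-01-01'))  # returns immediately
    deltas = pool.apply(collect, ('deltas', '2014-01-01'))        # blocks until done
    print(deltas, '|', baseline.get())                            # join on the baseline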
Example #5
    def _load_dataset(self, dates, assets, mask, columns):
        try:
            (dataset,) = set(map(getdataset, columns))
        except ValueError:
            raise AssertionError('all columns must come from the same dataset')

        expr, deltas, checkpoints, odo_kwargs = self[dataset]
        have_sids = (dataset.ndim == 2)
        asset_idx = pd.Series(index=assets, data=np.arange(len(assets)))
        assets = list(map(int, assets))  # coerce from numpy.int64
        added_query_fields = [AD_FIELD_NAME, TS_FIELD_NAME] + (
            [SID_FIELD_NAME] if have_sids else []
        )
        colnames = added_query_fields + list(map(getname, columns))

        data_query_time = self._data_query_time
        data_query_tz = self._data_query_tz
        lower_dt, upper_dt = normalize_data_query_bounds(
            dates[0],
            dates[-1],
            data_query_time,
            data_query_tz,
        )

        def collect_expr(e, lower):
            """Materialize the expression as a dataframe.

            Parameters
            ----------
            e : Expr
                The baseline or deltas expression.
            lower : datetime
                The lower time bound to query.

            Returns
            -------
            result : pd.DataFrame
                The resulting dataframe.

            Notes
            -----
            This can return more data than needed. The in-memory reindex will
            handle this.
            """
            predicate = e[TS_FIELD_NAME] <= upper_dt
            if lower is not None:
                predicate &= e[TS_FIELD_NAME] >= lower

            return odo(e[predicate][colnames], pd.DataFrame, **odo_kwargs)

        lower, materialized_checkpoints = get_materialized_checkpoints(
            checkpoints, colnames, lower_dt, odo_kwargs
        )

        materialized_expr = self.pool.apply_async(collect_expr, (expr, lower))
        materialized_deltas = (
            self.pool.apply(collect_expr, (deltas, lower))
            if deltas is not None else
            pd.DataFrame(columns=colnames)
        )

        if materialized_checkpoints is not None:
            materialized_expr = pd.concat(
                (
                    materialized_checkpoints,
                    materialized_expr.get(),
                ),
                ignore_index=True,
                copy=False,
            )

        # It's not guaranteed that assets returned by the engine will contain
        # all sids from the deltas table; filter out such mismatches here.
        if not materialized_deltas.empty and have_sids:
            materialized_deltas = materialized_deltas[
                materialized_deltas[SID_FIELD_NAME].isin(assets)
            ]

        if data_query_time is not None:
            for m in (materialized_expr, materialized_deltas):
                m.loc[:, TS_FIELD_NAME] = m.loc[
                    :, TS_FIELD_NAME
                ].astype('datetime64[ns]')

                normalize_timestamp_to_query_time(
                    m,
                    data_query_time,
                    data_query_tz,
                    inplace=True,
                    ts_field=TS_FIELD_NAME,
                )

        # Inline the deltas that changed our most recently known value.
        # Also, we reindex by the dates to create a dense representation of
        # the data.
        sparse_output, non_novel_deltas = overwrite_novel_deltas(
            materialized_expr,
            materialized_deltas,
            dates,
        )
        sparse_output.drop(AD_FIELD_NAME, axis=1, inplace=True)

        sparse_deltas = last_in_date_group(non_novel_deltas,
                                           dates,
                                           assets,
                                           reindex=False,
                                           have_sids=have_sids)
        dense_output = last_in_date_group(sparse_output,
                                          dates,
                                          assets,
                                          reindex=True,
                                          have_sids=have_sids)
        ffill_across_cols(
            dense_output, columns, {c.name: c.name for c in columns}
        )
        if have_sids:
            adjustments_from_deltas = adjustments_from_deltas_with_sids
            column_view = identity
        else:
            # If we do not have sids, use the column view to make a single
            # column vector which is unassociated with any assets.
            column_view = op.itemgetter(np.s_[:, np.newaxis])

            adjustments_from_deltas = adjustments_from_deltas_no_sids
            mask = np.full(
                shape=(len(mask), 1), fill_value=True, dtype=bool_dtype,
            )

        return {
            column: AdjustedArray(
                column_view(
                    dense_output[column.name].values.astype(column.dtype),
                ),
                mask,
                adjustments_from_deltas(
                    dates,
                    sparse_output[TS_FIELD_NAME].values,
                    column_idx,
                    column.name,
                    asset_idx,
                    sparse_deltas,
                ),
                column.missing_value,
            )
            for column_idx, column in enumerate(columns)
        }
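
last_in_date_group and ffill_across_cols are zipline helpers here rather than locals; Example #3 shows an inline definition of the former. The dense-output computation in miniature, with illustrative data:

    import pandas as pd

    # Take the last record per (date, sid), pivot sids into columns,
    # reindex onto the full calendar, then forward-fill.
    dates = pd.DatetimeIndex(['2014-01-01', '2014-01-02', '2014-01-03'])
    records = pd.DataFrame({
        'timestamp': pd.to_datetime(['2014-01-01', '2014-01-01', '2014-01-02']),
        'sid': [65, 65, 66],
        'value': [1.0, 2.0, 3.0],  # two writes for sid 65 on the same day
    })
    last = records.groupby(['timestamp', 'sid'], sort=False)['value'].last()
    dense = last.unstack().reindex(index=dates, columns=[65, 66]).ffill()
    assert dense.loc['2014-01-03', 65] == 2.0  # last write wins, then ffill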
Example #6
    def _load_dataset(self, dates, assets, mask, columns):
        try:
            (dataset,) = set(map(getdataset, columns))
        except ValueError:
            raise AssertionError('all columns must come from the same dataset')

        expr, deltas, checkpoints, odo_kwargs = self[dataset]
        have_sids = (dataset.ndim == 2)
        asset_idx = pd.Series(index=assets, data=np.arange(len(assets)))
        assets = list(map(int, assets))  # coerce from numpy.int64
        added_query_fields = [AD_FIELD_NAME, TS_FIELD_NAME] + (
            [SID_FIELD_NAME] if have_sids else []
        )
        colnames = added_query_fields + list(map(getname, columns))

        data_query_time = self._data_query_time
        data_query_tz = self._data_query_tz
        lower_dt, upper_dt = normalize_data_query_bounds(
            dates[0],
            dates[-1],
            data_query_time,
            data_query_tz,
        )

        def collect_expr(e, lower):
            """Materialize the expression as a dataframe.

            Parameters
            ----------
            e : Expr
                The baseline or deltas expression.
            lower : datetime
                The lower time bound to query.

            Returns
            -------
            result : pd.DataFrame
                The resulting dataframe.

            Notes
            -----
            This can return more data than needed. The in-memory reindex will
            handle this.
            """
            predicate = e[TS_FIELD_NAME] <= upper_dt
            if lower is not None:
                predicate &= e[TS_FIELD_NAME] >= lower

            return odo(e[predicate][colnames], pd.DataFrame, **odo_kwargs)

        if checkpoints is not None:
            ts = checkpoints[TS_FIELD_NAME]
            checkpoints_ts = odo(ts[ts <= lower_dt].max(), pd.Timestamp)
            if pd.isnull(checkpoints_ts):
                materialized_checkpoints = pd.DataFrame(columns=colnames)
                lower = None
            else:
                materialized_checkpoints = odo(
                    checkpoints[ts == checkpoints_ts][colnames], pd.DataFrame,
                    **odo_kwargs)
                lower = checkpoints_ts
        else:
            materialized_checkpoints = pd.DataFrame(columns=colnames)
            lower = None

        materialized_expr = self.pool.apply_async(collect_expr, (expr, lower))
        materialized_deltas = (
            self.pool.apply(collect_expr, (deltas, lower))
            if deltas is not None else
            pd.DataFrame(columns=colnames)
        )

        if materialized_checkpoints is not None:
            materialized_expr = pd.concat(
                (
                    materialized_checkpoints,
                    materialized_expr.get(),
                ),
                ignore_index=True,
                copy=False,
            )

        # It's not guaranteed that assets returned by the engine will contain
        # all sids from the deltas table; filter out such mismatches here.
        if not materialized_deltas.empty and have_sids:
            materialized_deltas = materialized_deltas[
                materialized_deltas[SID_FIELD_NAME].isin(assets)]

        if data_query_time is not None:
            for m in (materialized_expr, materialized_deltas):
                m.loc[:, TS_FIELD_NAME] = m.loc[:, TS_FIELD_NAME].astype(
                    'datetime64[ns]')

                normalize_timestamp_to_query_time(
                    m,
                    data_query_time,
                    data_query_tz,
                    inplace=True,
                    ts_field=TS_FIELD_NAME,
                )

        # Inline the deltas that changed our most recently known value.
        # Also, we reindex by the dates to create a dense representation of
        # the data.
        sparse_output, non_novel_deltas = overwrite_novel_deltas(
            materialized_expr,
            materialized_deltas,
            dates,
        )
        sparse_output.drop(AD_FIELD_NAME, axis=1, inplace=True)

        def last_in_date_group(df, reindex, have_sids=have_sids):
            idx = dates[dates.searchsorted(
                df[TS_FIELD_NAME].values.astype('datetime64[D]'))]
            if have_sids:
                idx = [idx, SID_FIELD_NAME]

            last_in_group = df.drop(TS_FIELD_NAME, axis=1).groupby(
                idx,
                sort=False,
            ).last()

            if have_sids:
                last_in_group = last_in_group.unstack()

            if reindex:
                if have_sids:
                    cols = last_in_group.columns
                    last_in_group = last_in_group.reindex(
                        index=dates,
                        columns=pd.MultiIndex.from_product(
                            (cols.levels[0], assets),
                            names=cols.names,
                        ),
                    )
                else:
                    last_in_group = last_in_group.reindex(dates)

            return last_in_group

        sparse_deltas = last_in_date_group(non_novel_deltas, reindex=False)
        dense_output = last_in_date_group(sparse_output, reindex=True)
        dense_output.ffill(inplace=True)

        # Fill in missing values specified by each column. This is made
        # significantly more complex by the fact that we need to work around
        # two pandas issues:

        # 1) When we have sids, if there are no records for a given sid for any
        #    dates, pandas will generate a column full of NaNs for that sid.
        #    This means that some of the columns in `dense_output` are now
        #    float instead of the intended dtype, so we have to coerce back to
        #    our expected type and convert NaNs into the desired missing value.

        # 2) DataFrame.fillna assumes that receiving None as a fill-value means
        #    that no value was passed.  Consequently, there's no way to tell
        #    pandas to replace NaNs in an object column with None using fillna,
        #    so we have to roll our own instead using df.where.
        for column in columns:
            # Special logic for strings since `fillna` doesn't work if the
            # missing value is `None`.
            if column.dtype == categorical_dtype:
                dense_output[column.name] = dense_output[column.name].where(
                    pd.notnull(dense_output[column.name]),
                    column.missing_value)
            else:
                # We need to execute `fillna` before `astype` in case the
                # column contains NaNs and needs to be cast to bool or int.
                # This is so that the NaNs are replaced first, since pandas
                # can't convert NaNs for those types.
                dense_output[column.name] = dense_output[column.name].fillna(
                    column.missing_value).astype(column.dtype)

        if have_sids:
            adjustments_from_deltas = adjustments_from_deltas_with_sids
            column_view = identity
        else:
            # If we do not have sids, use the column view to make a single
            # column vector which is unassociated with any assets.
            column_view = op.itemgetter(np.s_[:, np.newaxis])

            adjustments_from_deltas = adjustments_from_deltas_no_sids
            mask = np.full(
                shape=(len(mask), 1),
                fill_value=True,
                dtype=bool_dtype,
            )

        return {
            column: AdjustedArray(
                column_view(
                    dense_output[column.name].values.astype(column.dtype),
                ),
                mask,
                adjustments_from_deltas(
                    dates,
                    sparse_output[TS_FIELD_NAME].values,
                    column_idx,
                    column.name,
                    asset_idx,
                    sparse_deltas,
                ),
                column.missing_value,
            )
            for column_idx, column in enumerate(columns)
        }
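
The checkpoints block above finds the newest checkpoint at or before the query's lower bound, so only rows from that timestamp forward need to be fetched. The same lookup in plain pandas, standing in for the blaze/odo calls:

    import pandas as pd

    checkpoints = pd.DataFrame({
        'timestamp': pd.to_datetime(['2013-12-01', '2014-01-01', '2014-02-01']),
        'value': [10.0, 20.0, 30.0],
    })
    lower_dt = pd.Timestamp('2014-01-15')

    ts = checkpoints['timestamp']
    checkpoints_ts = ts[ts <= lower_dt].max()
    if pd.isnull(checkpoints_ts):
        lower = None  # no usable checkpoint: scan the whole table
        materialized = checkpoints.iloc[:0]
    else:
        materialized = checkpoints[ts == checkpoints_ts]
        lower = checkpoints_ts  # the expression query can start here
    assert lower == pd.Timestamp('2014-01-01')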
Example #7
    def test_novel_deltas(self, asset_info):
        base_dates = pd.DatetimeIndex([
            pd.Timestamp('2014-01-01'),
            pd.Timestamp('2014-01-04')
        ])
        repeated_dates = base_dates.repeat(3)
        baseline = pd.DataFrame({
            'sid': self.sids * 2,
            'value': (0., 1., 2., 1., 2., 3.),
            'int_value': (0, 1, 2, 1, 2, 3),
            'asof_date': repeated_dates,
            'timestamp': repeated_dates,
        })
        expr = bz.data(baseline, name='expr', dshape=self.dshape)
        deltas = bz.data(
            odo(
                bz.transform(
                    expr,
                    value=expr.value + 10,
                    timestamp=expr.timestamp + timedelta(days=1),
                ),
                pd.DataFrame,
            ),
            name='delta',
            dshape=self.dshape,
        )
        expected_views = keymap(pd.Timestamp, {
            '2014-01-03': np.array([[10.0, 11.0, 12.0],
                                    [10.0, 11.0, 12.0],
                                    [10.0, 11.0, 12.0]]),
            '2014-01-06': np.array([[10.0, 11.0, 12.0],
                                    [10.0, 11.0, 12.0],
                                    [11.0, 12.0, 13.0]]),
        })
        if len(asset_info) == 4:
            expected_views = valmap(
                lambda view: np.c_[view, [np.nan, np.nan, np.nan]],
                expected_views,
            )
            expected_output_buffer = [10, 11, 12, np.nan, 11, 12, 13, np.nan]
        else:
            expected_output_buffer = [10, 11, 12, 11, 12, 13]

        cal = pd.DatetimeIndex([
            pd.Timestamp('2014-01-01'),
            pd.Timestamp('2014-01-02'),
            pd.Timestamp('2014-01-03'),
            # omitting the 4th and 5th to simulate a weekend
            pd.Timestamp('2014-01-06'),
        ])

        with tmp_asset_finder(equities=asset_info) as finder:
            expected_output = pd.DataFrame(
                expected_output_buffer,
                index=pd.MultiIndex.from_product((
                    sorted(expected_views.keys()),
                    finder.retrieve_all(asset_info.index),
                )),
                columns=('value',),
            )
            self._run_pipeline(
                expr,
                deltas,
                expected_views,
                expected_output,
                finder,
                calendar=cal,
                start=cal[2],
                end=cal[-1],
                window_length=3,
                compute_fn=op.itemgetter(-1),
            )
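
Unlike the other tests, this one materializes the transformed expression through odo before wrapping it as a new blaze term. What that round trip produces, shown in plain pandas:

    import pandas as pd
    from datetime import timedelta

    baseline = pd.DataFrame({
        'value': [0.0, 1.0],
        'timestamp': pd.to_datetime(['2014-01-01', '2014-01-04']),
    })
    # Each delta restates a baseline row with value + 10, becoming
    # visible one day after the original timestamp.
    deltas_df = baseline.assign(
        value=baseline['value'] + 10,
        timestamp=baseline['timestamp'] + timedelta(days=1),
    )
    assert deltas_df['value'].tolist() == [10.0, 11.0]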