Example #1
    def get_last_data_per_qtr(self, assets_with_data, columns, dates):
        """
        Determine the last piece of information we know for each column on each
        date in the index for each sid and quarter.

        Parameters
        ----------
        assets_with_data : pd.Index
            Index of all assets that appear in the raw data given to the
            loader.
        columns : iterable of BoundColumn
            The columns that need to be loaded from the raw data.
        dates : pd.DatetimeIndex
            The calendar of dates for which data should be loaded.

        Returns
        -------
        last_per_qtr : pd.DataFrame
            A DataFrame with columns that are a MultiIndex of
            [self.estimates.columns, normalized_quarters, sid].
        stacked_last_per_qtr : pd.DataFrame
            A DataFrame indexed by [dates, sid, normalized_quarters] that has
            the latest information for each row of the index, sorted by event
            date.
        """
        # Get a DataFrame indexed by date with a MultiIndex of columns of [
        # self.estimates.columns, normalized_quarters, sid], where each cell
        # contains the latest data for that day.
        last_per_qtr = last_in_date_group(
            self.estimates,
            dates,
            assets_with_data,
            reindex=True,
            extra_groupers=[NORMALIZED_QUARTERS],
        )
        # Forward fill values for each quarter/sid/dataset column.
        ffill_across_cols(last_per_qtr, columns, self.name_map)
        # Stack quarter and sid into the index.
        stacked_last_per_qtr = last_per_qtr.stack(
            [SID_FIELD_NAME, NORMALIZED_QUARTERS],
        )
        # Set date index name for ease of reference
        stacked_last_per_qtr.index.set_names(
            SIMULATION_DATES,
            level=0,
            inplace=True,
        )
        stacked_last_per_qtr = stacked_last_per_qtr.sort_values(
            EVENT_DATE_FIELD_NAME,
        )
        stacked_last_per_qtr[EVENT_DATE_FIELD_NAME] = pd.to_datetime(
            stacked_last_per_qtr[EVENT_DATE_FIELD_NAME]
        )
        return last_per_qtr, stacked_last_per_qtr
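
A minimal, self-contained sketch of the reshaping that the stack call above
performs, using made-up data and plain strings in place of the zipline
constants (SID_FIELD_NAME, NORMALIZED_QUARTERS, SIMULATION_DATES):

    import numpy as np
    import pandas as pd

    # Columns mirror last_per_qtr: a MultiIndex of
    # [estimate column, normalized quarter, sid].
    dates = pd.date_range('2014-01-06', periods=3, name='dates')
    columns = pd.MultiIndex.from_product(
        [['estimate'], [8056.0, 8057.0], [1, 2]],
        names=[None, 'normalized_quarters', 'sid'],
    )
    last_per_qtr = pd.DataFrame(
        np.arange(12.0).reshape(3, 4),
        index=dates,
        columns=columns,
    )

    # Move sid and quarter from the columns into the row index, as
    # last_per_qtr.stack([SID_FIELD_NAME, NORMALIZED_QUARTERS]) does above.
    stacked = last_per_qtr.stack(['sid', 'normalized_quarters'])
    stacked.index = stacked.index.set_names('dates', level=0)
    print(stacked.head())  # indexed by (dates, sid, normalized_quarters)
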
Example #2
    def _load_dataset(self, dates, assets, mask, columns):
        try:
            (dataset, ) = set(map(getdataset, columns))
        except ValueError:
            raise AssertionError('all columns must come from the same dataset')

        (expr, deltas, checkpoints,
         odo_kwargs, apply_deltas_adjustments) = self[dataset]
        have_sids = (dataset.ndim == 2)
        asset_idx = pd.Series(index=assets, data=np.arange(len(assets)))
        assets = list(map(int, assets))  # coerce from numpy.int64
        added_query_fields = {AD_FIELD_NAME, TS_FIELD_NAME} | (
            {SID_FIELD_NAME} if have_sids else set()
        )
        requested_columns = set(map(getname, columns))
        colnames = sorted(added_query_fields | requested_columns)

        data_query_time = self._data_query_time
        data_query_tz = self._data_query_tz
        lower_dt, upper_dt = normalize_data_query_bounds(
            dates[0],
            dates[-1],
            data_query_time,
            data_query_tz,
        )

        def collect_expr(e, lower):
            """Materialize the expression as a dataframe.

            Parameters
            ----------
            e : Expr
                The baseline or deltas expression.
            lower : datetime
                The lower time bound to query.

            Returns
            -------
            result : pd.DataFrame
                The resulting dataframe.

            Notes
            -----
            This can return more data than needed. The in memory reindex will
            handle this.
            """
            predicate = e[TS_FIELD_NAME] <= upper_dt
            if lower is not None:
                predicate &= e[TS_FIELD_NAME] >= lower

            return odo(e[predicate][colnames], pd.DataFrame, **odo_kwargs)

        lower, materialized_checkpoints = get_materialized_checkpoints(
            checkpoints, colnames, lower_dt, odo_kwargs)

        materialized_expr = self.pool.apply_async(collect_expr, (expr, lower))
        materialized_deltas = (
            self.pool.apply(collect_expr, (deltas, lower))
            if deltas is not None
            else pd.DataFrame(columns=colnames)
        )

        if materialized_checkpoints is not None:
            materialized_expr = pd.concat(
                (
                    materialized_checkpoints,
                    materialized_expr.get(),
                ),
                ignore_index=True,
                copy=False,
            )

        # It's not guaranteed that assets returned by the engine will contain
        # all sids from the deltas table; filter out such mismatches here.
        if not materialized_deltas.empty and have_sids:
            materialized_deltas = materialized_deltas[
                materialized_deltas[SID_FIELD_NAME].isin(assets)]

        if data_query_time is not None:
            for m in (materialized_expr, materialized_deltas):
                m.loc[:, TS_FIELD_NAME] = m.loc[:, TS_FIELD_NAME].astype(
                    'datetime64[ns]')

                normalize_timestamp_to_query_time(
                    m,
                    data_query_time,
                    data_query_tz,
                    inplace=True,
                    ts_field=TS_FIELD_NAME,
                )

        # Inline the deltas that changed our most recently known value.
        # Also, we reindex by the dates to create a dense representation of
        # the data.
        sparse_output, non_novel_deltas = overwrite_novel_deltas(
            materialized_expr,
            materialized_deltas,
            dates,
        )
        if AD_FIELD_NAME not in requested_columns:
            sparse_output.drop(AD_FIELD_NAME, axis=1, inplace=True)

        sparse_deltas = last_in_date_group(non_novel_deltas,
                                           dates,
                                           assets,
                                           reindex=False,
                                           have_sids=have_sids)
        dense_output = last_in_date_group(sparse_output,
                                          dates,
                                          assets,
                                          reindex=True,
                                          have_sids=have_sids)
        ffill_across_cols(
            dense_output,
            columns,
            {c.name: c.name for c in columns},
        )

        # By default, no non-novel deltas are applied.
        def no_adjustments_from_deltas(*args):
            return {}

        adjustments_from_deltas = no_adjustments_from_deltas
        if have_sids:
            if apply_deltas_adjustments:
                adjustments_from_deltas = adjustments_from_deltas_with_sids
            column_view = identity
        else:
            # If we do not have sids, use the column view to make a single
            # column vector which is unassociated with any assets.
            column_view = op.itemgetter(np.s_[:, np.newaxis])
            if apply_deltas_adjustments:
                adjustments_from_deltas = adjustments_from_deltas_no_sids
            mask = np.full(
                shape=(len(mask), 1),
                fill_value=True,
                dtype=bool_dtype,
            )

        return {
            column: AdjustedArray(
                column_view(
                    dense_output[column.name].values.astype(column.dtype),
                ),
                mask,
                adjustments_from_deltas(
                    dates,
                    sparse_output[TS_FIELD_NAME].values,
                    column_idx,
                    column.name,
                    asset_idx,
                    sparse_deltas,
                ),
                column.missing_value,
            )
            for column_idx, column in enumerate(columns)
        }
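
The no-sid branch above relies on a small indexing trick:
op.itemgetter(np.s_[:, np.newaxis]) turns a 1-D array of per-date values into
an (N, 1) column so it lines up with the one-column mask built next to it. A
quick illustration with invented values (only the shapes matter; the real code
uses zipline's bool_dtype rather than the plain bool used here):

    import operator as op
    import numpy as np

    # The same view the loader builds for datasets without sids.
    column_view = op.itemgetter(np.s_[:, np.newaxis])

    values = np.array([1.5, 2.5, 3.5])   # one value per simulation date
    print(values.shape)                  # (3,)
    print(column_view(values).shape)     # (3, 1)

    # The matching all-True mask, one column wide, as in the np.full call.
    mask = np.full(shape=(3, 1), fill_value=True, dtype=bool)
    print(mask.shape)                    # (3, 1)
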
Example #3
    def _load_dataset(self, dates, assets, mask, columns):
        try:
            (dataset,) = set(map(getdataset, columns))
        except ValueError:
            raise AssertionError('all columns must come from the same dataset')

        expr, deltas, checkpoints, odo_kwargs = self[dataset]
        have_sids = (dataset.ndim == 2)
        asset_idx = pd.Series(index=assets, data=np.arange(len(assets)))
        assets = list(map(int, assets))  # coerce from numpy.int64
        added_query_fields = [AD_FIELD_NAME, TS_FIELD_NAME] + (
            [SID_FIELD_NAME] if have_sids else []
        )
        colnames = added_query_fields + list(map(getname, columns))

        data_query_time = self._data_query_time
        data_query_tz = self._data_query_tz
        lower_dt, upper_dt = normalize_data_query_bounds(
            dates[0],
            dates[-1],
            data_query_time,
            data_query_tz,
        )

        def collect_expr(e, lower):
            """Materialize the expression as a dataframe.

            Parameters
            ----------
            e : Expr
                The baseline or deltas expression.
            lower : datetime
                The lower time bound to query.

            Returns
            -------
            result : pd.DataFrame
                The resulting dataframe.

            Notes
            -----
            This can return more data than needed. The in memory reindex will
            handle this.
            """
            predicate = e[TS_FIELD_NAME] <= upper_dt
            if lower is not None:
                predicate &= e[TS_FIELD_NAME] >= lower

            return odo(e[predicate][colnames], pd.DataFrame, **odo_kwargs)

        lower, materialized_checkpoints = get_materialized_checkpoints(
            checkpoints, colnames, lower_dt, odo_kwargs
        )

        materialized_expr = self.pool.apply_async(collect_expr, (expr, lower))
        materialized_deltas = (
            self.pool.apply(collect_expr, (deltas, lower))
            if deltas is not None else
            pd.DataFrame(columns=colnames)
        )

        if materialized_checkpoints is not None:
            materialized_expr = pd.concat(
                (
                    materialized_checkpoints,
                    materialized_expr.get(),
                ),
                ignore_index=True,
                copy=False,
            )

        # It's not guaranteed that assets returned by the engine will contain
        # all sids from the deltas table; filter out such mismatches here.
        if not materialized_deltas.empty and have_sids:
            materialized_deltas = materialized_deltas[
                materialized_deltas[SID_FIELD_NAME].isin(assets)
            ]

        if data_query_time is not None:
            for m in (materialized_expr, materialized_deltas):
                m.loc[:, TS_FIELD_NAME] = m.loc[
                    :, TS_FIELD_NAME
                ].astype('datetime64[ns]')

                normalize_timestamp_to_query_time(
                    m,
                    data_query_time,
                    data_query_tz,
                    inplace=True,
                    ts_field=TS_FIELD_NAME,
                )

        # Inline the deltas that changed our most recently known value.
        # Also, we reindex by the dates to create a dense representation of
        # the data.
        sparse_output, non_novel_deltas = overwrite_novel_deltas(
            materialized_expr,
            materialized_deltas,
            dates,
        )
        sparse_output.drop(AD_FIELD_NAME, axis=1, inplace=True)

        sparse_deltas = last_in_date_group(non_novel_deltas,
                                           dates,
                                           assets,
                                           reindex=False,
                                           have_sids=have_sids)
        dense_output = last_in_date_group(sparse_output,
                                          dates,
                                          assets,
                                          reindex=True,
                                          have_sids=have_sids)
        ffill_across_cols(
            dense_output,
            columns,
            {c.name: c.name for c in columns},
        )
        if have_sids:
            adjustments_from_deltas = adjustments_from_deltas_with_sids
            column_view = identity
        else:
            # If we do not have sids, use the column view to make a single
            # column vector which is unassociated with any assets.
            column_view = op.itemgetter(np.s_[:, np.newaxis])

            adjustments_from_deltas = adjustments_from_deltas_no_sids
            mask = np.full(
                shape=(len(mask), 1), fill_value=True, dtype=bool_dtype,
            )

        return {
            column: AdjustedArray(
                column_view(
                    dense_output[column.name].values.astype(column.dtype),
                ),
                mask,
                adjustments_from_deltas(
                    dates,
                    sparse_output[TS_FIELD_NAME].values,
                    column_idx,
                    column.name,
                    asset_idx,
                    sparse_deltas,
                ),
                column.missing_value,
            )
            for column_idx, column in enumerate(columns)
        }
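
Both versions of _load_dataset (and get_last_data_per_qtr above) lean on
last_in_date_group followed by ffill_across_cols to turn sparse event rows
into a dense per-session panel. A much-simplified stand-in for that pattern in
plain pandas, with made-up data (the real helpers also handle the extra
quarter grouper, dtype-aware missing values, and the no-sid case):

    import pandas as pd

    calendar = pd.date_range('2014-01-06', periods=4)
    raw = pd.DataFrame({
        'timestamp': pd.to_datetime(
            ['2014-01-06', '2014-01-06', '2014-01-08']),
        'sid': [1, 1, 2],
        'value': [10.0, 11.0, 20.0],
    })

    # Keep the last record per (date, sid), pivot sids into columns over the
    # full calendar, then forward fill so every session carries the latest
    # known value.
    last = (
        raw.sort_values('timestamp')
           .groupby(['timestamp', 'sid'], as_index=False)
           .last()
    )
    dense = (
        last.pivot(index='timestamp', columns='sid', values='value')
            .reindex(calendar)
            .ffill()
    )
    print(dense)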