Example #1
    def as_categorical(self):
        """
        Coerce self into a pandas categorical.

        This is only defined on 1D arrays, since that's all pandas supports.
        """
        if len(self.shape) > 1:
            raise ValueError("Can't convert a 2D array to a categorical.")

        with ignore_pandas_nan_categorical_warning():
            return pd.Categorical.from_codes(
                self.as_int_array(),
                # We need to make a copy because pandas >= 0.17 fails if this
                # buffer isn't writeable.
                self.categories.copy(),
                ordered=False,
            )
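
This method is essentially a thin wrapper around `pd.Categorical.from_codes`. As a minimal standalone sketch (the sample codes and category labels below are invented for illustration), the call maps integer codes onto category labels, with -1 as the missing-value sentinel:

    import numpy as np
    import pandas as pd

    # Integer codes index into the categories array; -1 marks a missing value.
    codes = np.array([0, 1, -1, 0], dtype=np.int64)
    categories = np.array(['small', 'large'], dtype=object)

    # Pass a copy so pandas receives a writeable buffer, per the comment above.
    cat = pd.Categorical.from_codes(codes, categories.copy(), ordered=False)
    print(cat)
    # ['small', 'large', NaN, 'small']
    # Categories (2, object): ['small', 'large']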
Example #2
    def test_latest(self):
        columns = TDS.columns
        pipe = Pipeline(
            columns={c.name: c.latest for c in columns},
        )

        cal_slice = slice(20, 40)
        dates_to_test = self.trading_days[cal_slice]
        result = self.engine.run_pipeline(
            pipe,
            dates_to_test[0],
            dates_to_test[-1],
        )
        for column in columns:
            with ignore_pandas_nan_categorical_warning():
                col_result = result[column.name].unstack()

            expected_col_result = self.expected_latest(column, cal_slice)
            assert_frame_equal(col_result, expected_col_result)
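
Every example on this page wraps its pandas calls in `ignore_pandas_nan_categorical_warning`, which silences the warning pandas emits when a categorical's categories contain None/NaN values. The helper itself is not shown here; a minimal sketch of such a context manager, assuming all it needs to do is install a temporary warnings filter, could look like this:

    import warnings
    from contextlib import contextmanager

    @contextmanager
    def ignore_pandas_nan_categorical_warning():
        # Temporarily ignore the FutureWarning pandas raises for null-ish
        # category values; the original filters are restored on exit.
        with warnings.catch_warnings():
            warnings.simplefilter('ignore', category=FutureWarning)
            yield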
Example #3
    def _load_dataset(self,
                      dates,
                      data_query_cutoff_times,
                      assets,
                      mask,
                      columns):
        try:
            (expr_data,) = {self._table_expressions[c] for c in columns}
        except ValueError:
            raise AssertionError(
                'all columns must share the same expression data',
            )

        expr, deltas, checkpoints, odo_kwargs = expr_data

        have_sids = (first(columns).dataset.ndim == 2)
        added_query_fields = {AD_FIELD_NAME, TS_FIELD_NAME} | (
            {SID_FIELD_NAME} if have_sids else set()
        )
        requested_columns = set(map(getname, columns))
        colnames = sorted(added_query_fields | requested_columns)

        lower_dt, upper_dt = data_query_cutoff_times[[0, -1]]

        def collect_expr(e, lower):
            """Materialize the expression as a dataframe.

            Parameters
            ----------
            e : Expr
                The baseline or deltas expression.
            lower : datetime
                The lower time bound to query.

            Returns
            -------
            result : pd.DataFrame
                The resulting dataframe.

            Notes
            -----
            This can return more data than needed. The in memory reindex will
            handle this.
            """
            predicate = e[TS_FIELD_NAME] < upper_dt
            if lower is not None:
                predicate &= e[TS_FIELD_NAME] >= lower

            return odo(e[predicate][colnames], pd.DataFrame, **odo_kwargs)

        lower, materialized_checkpoints = get_materialized_checkpoints(
            checkpoints, colnames, lower_dt, odo_kwargs
        )

        materialized_expr_deferred = self.pool.apply_async(
            collect_expr,
            (expr, lower),
        )
        materialized_deltas = (
            self.pool.apply(collect_expr, (deltas, lower))
            if deltas is not None else
            None
        )

        # If the rows that come back from the blaze backend are constructed
        # from LabelArrays with Nones in the categories, pandas
        # complains. Ignore those warnings for now until we have a story for
        # updating our categorical missing values to NaN.
        with ignore_pandas_nan_categorical_warning():
            all_rows = pd.concat(
                filter(
                    lambda df: df is not None, (
                        materialized_checkpoints,
                        materialized_expr_deferred.get(),
                        materialized_deltas,
                    ),
                ),
                ignore_index=True,
                copy=False,
            )

        all_rows[TS_FIELD_NAME] = all_rows[TS_FIELD_NAME].astype(
            'datetime64[ns]',
        )
        all_rows.sort_values([TS_FIELD_NAME, AD_FIELD_NAME], inplace=True)

        if have_sids:
            return adjusted_arrays_from_rows_with_assets(
                dates,
                data_query_cutoff_times,
                assets,
                columns,
                all_rows,
            )
        else:
            return adjusted_arrays_from_rows_without_assets(
                dates,
                data_query_cutoff_times,
                columns,
                all_rows,
            )
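
One idiom worth noting in this example is the guarded unpacking at the top: building a set over `self._table_expressions[c]` deduplicates the per-column expression tuples, and tuple-unpacking a single element raises `ValueError` whenever the columns map to zero or to several different tuples. The same pattern in isolation (the toy mapping below is invented for illustration):

    # Unpacking a one-element set asserts that every key maps to the
    # same value; any other cardinality raises ValueError.
    table_for = {'price': 'prices_table', 'volume': 'prices_table'}

    try:
        (shared,) = {table_for[c] for c in ('price', 'volume')}
    except ValueError:
        raise AssertionError('all columns must share the same expression data')

    print(shared)  # prices_table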