def as_categorical(self):
    """
    Coerce self into a pandas categorical.

    This is only defined on 1D arrays, since that's all pandas supports.
    """
    if len(self.shape) > 1:
        raise ValueError("Can't convert a 2D array to a categorical.")

    with ignore_pandas_nan_categorical_warning():
        # pandas >= 0.17 fails if the categories buffer isn't writeable,
        # so hand it a fresh copy of our categories.
        writable_categories = self.categories.copy()
        return pd.Categorical.from_codes(
            self.as_int_array(),
            writable_categories,
            ordered=False,
        )
def test_latest(self):
    """Check that `.latest` on every dataset column matches expectations."""
    columns = TDS.columns
    pipe = Pipeline(
        columns={column.name: column.latest for column in columns},
    )

    cal_slice = slice(20, 40)
    dates_to_test = self.trading_days[cal_slice]
    result = self.engine.run_pipeline(
        pipe,
        dates_to_test[0],
        dates_to_test[-1],
    )

    for column in columns:
        # Unstacking a frame built from LabelArrays with None categories
        # triggers a pandas warning we deliberately ignore.
        with ignore_pandas_nan_categorical_warning():
            col_result = result[column.name].unstack()

        expected_col_result = self.expected_latest(column, cal_slice)
        assert_frame_equal(col_result, expected_col_result)
def as_categorical(self):
    """
    Coerce self into a pandas categorical.

    This is only defined on 1D arrays, since that's all pandas supports.
    """
    if len(self.shape) > 1:
        raise ValueError("Can't convert a 2D array to a categorical.")

    codes = self.as_int_array()
    with ignore_pandas_nan_categorical_warning():
        return pd.Categorical.from_codes(
            codes,
            # Copy because pandas >= 0.17 rejects a non-writeable
            # categories buffer.
            self.categories.copy(),
            ordered=False,
        )
def test_latest(self):
    """Run a pipeline of `.latest` terms and compare against expectations."""
    columns = TDS.columns
    latest_terms = {c.name: c.latest for c in columns}
    pipe = Pipeline(columns=latest_terms)

    cal_slice = slice(20, 40)
    dates_to_test = self.trading_days[cal_slice]
    start, end = dates_to_test[0], dates_to_test[-1]
    result = self.engine.run_pipeline(pipe, start, end)

    for column in columns:
        # Suppress the pandas warning raised for categoricals whose
        # categories contain None.
        with ignore_pandas_nan_categorical_warning():
            col_result = result[column.name].unstack()

        assert_frame_equal(
            col_result,
            self.expected_latest(column, cal_slice),
        )
def _load_dataset(self,
                  dates,
                  data_query_cutoff_times,
                  assets,
                  mask,
                  columns):
    """Load the requested columns by materializing their shared blaze
    expression (plus any deltas/checkpoints) into adjusted arrays.

    All of ``columns`` must be backed by the same expression data;
    anything else is a programming error and raises ``AssertionError``.
    """
    # Collapse the per-column expression data into a single value; the
    # set has exactly one element when all columns agree.
    try:
        (expr_data,) = {self._table_expressions[c] for c in columns}
    except ValueError:
        raise AssertionError(
            'all columns must share the same expression data',
        )

    expr, deltas, checkpoints, odo_kwargs = expr_data

    # 2-dimensional datasets carry a sid per row; 1-dimensional ones
    # don't, so SID_FIELD_NAME is only queried when needed.
    have_sids = (first(columns).dataset.ndim == 2)
    added_query_fields = {AD_FIELD_NAME, TS_FIELD_NAME} | (
        {SID_FIELD_NAME} if have_sids else set()
    )
    requested_columns = set(map(getname, columns))
    colnames = sorted(added_query_fields | requested_columns)

    lower_dt, upper_dt = data_query_cutoff_times[[0, -1]]

    def collect_expr(e, lower):
        """Materialize the expression as a dataframe.

        Parameters
        ----------
        e : Expr
            The baseline or deltas expression.
        lower : datetime
            The lower time bound to query.

        Returns
        -------
        result : pd.DataFrame
            The resulting dataframe.

        Notes
        -----
        This can return more data than needed. The in memory reindex will
        handle this.
        """
        predicate = e[TS_FIELD_NAME] < upper_dt
        if lower is not None:
            predicate &= e[TS_FIELD_NAME] >= lower
        return odo(e[predicate][colnames], pd.DataFrame, **odo_kwargs)

    lower, materialized_checkpoints = get_materialized_checkpoints(
        checkpoints,
        colnames,
        lower_dt,
        odo_kwargs,
    )

    # Kick off the baseline query asynchronously while the deltas query
    # (if any) runs on the pool synchronously.
    materialized_expr_deferred = self.pool.apply_async(
        collect_expr,
        (expr, lower),
    )
    if deltas is not None:
        materialized_deltas = self.pool.apply(collect_expr, (deltas, lower))
    else:
        materialized_deltas = None

    # If the rows that come back from the blaze backend are constructed
    # from LabelArrays with Nones in the categories, pandas complains.
    # Ignore those warnings for now until we have a story for updating
    # our categorical missing values to NaN.
    with ignore_pandas_nan_categorical_warning():
        all_rows = pd.concat(
            filter(
                lambda df: df is not None,
                (
                    materialized_checkpoints,
                    materialized_expr_deferred.get(),
                    materialized_deltas,
                ),
            ),
            ignore_index=True,
            copy=False,
        )

    all_rows[TS_FIELD_NAME] = all_rows[TS_FIELD_NAME].astype(
        'datetime64[ns]',
    )
    all_rows.sort_values([TS_FIELD_NAME, AD_FIELD_NAME], inplace=True)

    if have_sids:
        return adjusted_arrays_from_rows_with_assets(
            dates,
            data_query_cutoff_times,
            assets,
            columns,
            all_rows,
        )
    else:
        return adjusted_arrays_from_rows_without_assets(
            dates,
            data_query_cutoff_times,
            columns,
            all_rows,
        )
def _load_dataset(self,
                  dates,
                  data_query_cutoff_times,
                  assets,
                  mask,
                  columns):
    """Materialize the columns' shared blaze expression (with optional
    deltas and checkpoints) and convert the rows into adjusted arrays.

    Raises
    ------
    AssertionError
        If the requested columns do not all share the same expression
        data.
    """
    unique_expr_data = {self._table_expressions[c] for c in columns}
    try:
        # Unpacking fails with ValueError unless there is exactly one.
        (expr_data,) = unique_expr_data
    except ValueError:
        raise AssertionError(
            'all columns must share the same expression data',
        )

    expr, deltas, checkpoints, odo_kwargs = expr_data

    # Only 2-d datasets have a sid dimension, so only then do we need to
    # pull SID_FIELD_NAME back from the backend.
    have_sids = (first(columns).dataset.ndim == 2)
    extra_fields = {AD_FIELD_NAME, TS_FIELD_NAME}
    if have_sids:
        extra_fields.add(SID_FIELD_NAME)
    colnames = sorted(extra_fields | set(map(getname, columns)))

    lower_dt, upper_dt = data_query_cutoff_times[[0, -1]]

    def collect_expr(e, lower):
        """Materialize the expression as a dataframe.

        Parameters
        ----------
        e : Expr
            The baseline or deltas expression.
        lower : datetime
            The lower time bound to query.

        Returns
        -------
        result : pd.DataFrame
            The resulting dataframe.

        Notes
        -----
        This can return more data than needed. The in memory reindex will
        handle this.
        """
        predicate = e[TS_FIELD_NAME] < upper_dt
        if lower is not None:
            predicate &= e[TS_FIELD_NAME] >= lower
        return odo(e[predicate][colnames], pd.DataFrame, **odo_kwargs)

    lower, materialized_checkpoints = get_materialized_checkpoints(
        checkpoints,
        colnames,
        lower_dt,
        odo_kwargs,
    )

    # Baseline query runs asynchronously; deltas (if present) run on the
    # pool while we wait.
    materialized_expr_deferred = self.pool.apply_async(
        collect_expr,
        (expr, lower),
    )
    materialized_deltas = (
        self.pool.apply(collect_expr, (deltas, lower))
        if deltas is not None
        else None
    )

    # If the rows that come back from the blaze backend are constructed
    # from LabelArrays with Nones in the categories, pandas complains.
    # Ignore those warnings for now until we have a story for updating
    # our categorical missing values to NaN.
    with ignore_pandas_nan_categorical_warning():
        frames = (
            materialized_checkpoints,
            materialized_expr_deferred.get(),
            materialized_deltas,
        )
        all_rows = pd.concat(
            [frame for frame in frames if frame is not None],
            ignore_index=True,
            copy=False,
        )

    all_rows[TS_FIELD_NAME] = all_rows[TS_FIELD_NAME].astype(
        'datetime64[ns]',
    )
    all_rows.sort_values([TS_FIELD_NAME, AD_FIELD_NAME], inplace=True)

    if have_sids:
        return adjusted_arrays_from_rows_with_assets(
            dates,
            data_query_cutoff_times,
            assets,
            columns,
            all_rows,
        )
    return adjusted_arrays_from_rows_without_assets(
        dates,
        data_query_cutoff_times,
        columns,
        all_rows,
    )