def test_categorical_df_concat_value_error(self):
    mismatched_dtypes = [
        pd.DataFrame({
            'A': pd.Series(['a', 'b', 'c'], dtype='category'),
            'B': pd.Series([100, 102, 103], dtype='int64'),
        }),
        pd.DataFrame({
            'A': pd.Series(['c', 'b', 'd'], dtype='category'),
            'B': pd.Series([103, 102, 104], dtype='float64'),
        }),
    ]
    mismatched_column_names = [
        pd.DataFrame({
            'A': pd.Series(['a', 'b', 'c'], dtype='category'),
            'B': pd.Series([100, 102, 103], dtype='int64'),
        }),
        pd.DataFrame({
            'A': pd.Series(['c', 'b', 'd'], dtype='category'),
            'X': pd.Series([103, 102, 104], dtype='int64'),
        }),
    ]

    with self.assertRaises(ValueError) as cm:
        categorical_df_concat(mismatched_dtypes)
    self.assertEqual(
        str(cm.exception),
        "Input DataFrames must have the same columns/dtypes.")

    with self.assertRaises(ValueError) as cm:
        categorical_df_concat(mismatched_column_names)
    self.assertEqual(
        str(cm.exception),
        "Input DataFrames must have the same columns/dtypes.")
def test_categorical_df_concat_value_error(self): mismatched_dtypes = [ pd.DataFrame({ "A": pd.Series(["a", "b", "c"], dtype="category"), "B": pd.Series([100, 102, 103], dtype="int64"), }), pd.DataFrame({ "A": pd.Series(["c", "b", "d"], dtype="category"), "B": pd.Series([103, 102, 104], dtype="float64"), }), ] mismatched_column_names = [ pd.DataFrame({ "A": pd.Series(["a", "b", "c"], dtype="category"), "B": pd.Series([100, 102, 103], dtype="int64"), }), pd.DataFrame({ "A": pd.Series(["c", "b", "d"], dtype="category"), "X": pd.Series([103, 102, 104], dtype="int64"), }), ] with self.assertRaises(ValueError) as cm: categorical_df_concat(mismatched_dtypes) self.assertEqual( str(cm.exception), "Input DataFrames must have the same columns/dtypes.") with self.assertRaises(ValueError) as cm: categorical_df_concat(mismatched_column_names) self.assertEqual( str(cm.exception), "Input DataFrames must have the same columns/dtypes.")
def test_categorical_df_concat_value_error(self): mismatched_dtypes = [ pd.DataFrame({ "A": pd.Series(["a", "b", "c"], dtype="category"), "B": pd.Series([100, 102, 103], dtype="int64"), }), pd.DataFrame({ "A": pd.Series(["c", "b", "d"], dtype="category"), "B": pd.Series([103, 102, 104], dtype="float64"), }), ] mismatched_column_names = [ pd.DataFrame({ "A": pd.Series(["a", "b", "c"], dtype="category"), "B": pd.Series([100, 102, 103], dtype="int64"), }), pd.DataFrame({ "A": pd.Series(["c", "b", "d"], dtype="category"), "X": pd.Series([103, 102, 104], dtype="int64"), }), ] with pytest.raises( ValueError, match="Input DataFrames must have the same columns/dtypes."): categorical_df_concat(mismatched_dtypes) with pytest.raises( ValueError, match="Input DataFrames must have the same columns/dtypes."): categorical_df_concat(mismatched_column_names)
def test_categorical_df_concat_value_error(self):
    mismatched_dtypes = [
        pd.DataFrame(
            {
                'A': pd.Series(['a', 'b', 'c'], dtype='category'),
                'B': pd.Series([100, 102, 103], dtype='int64'),
            }
        ),
        pd.DataFrame(
            {
                'A': pd.Series(['c', 'b', 'd'], dtype='category'),
                'B': pd.Series([103, 102, 104], dtype='float64'),
            }
        ),
    ]
    mismatched_column_names = [
        pd.DataFrame(
            {
                'A': pd.Series(['a', 'b', 'c'], dtype='category'),
                'B': pd.Series([100, 102, 103], dtype='int64'),
            }
        ),
        pd.DataFrame(
            {
                'A': pd.Series(['c', 'b', 'd'], dtype='category'),
                'X': pd.Series([103, 102, 104], dtype='int64'),
            }
        ),
    ]

    with self.assertRaises(ValueError) as cm:
        categorical_df_concat(mismatched_dtypes)
    self.assertEqual(
        str(cm.exception),
        "Input DataFrames must have the same columns/dtypes."
    )

    with self.assertRaises(ValueError) as cm:
        categorical_df_concat(mismatched_column_names)
    self.assertEqual(
        str(cm.exception),
        "Input DataFrames must have the same columns/dtypes."
    )
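For context, these tests exercise a helper along the following lines. This is a minimal sketch reconstructed from the tested behavior (same-columns/dtypes validation with the exact error message above, union of each categorical column's categories, then a plain pd.concat that preserves per-frame indices); the real implementation may differ in details.

import pandas as pd


def categorical_df_concat(df_list, inplace=False):
    # Sketch only, inferred from the tests in this section.
    if not inplace:
        df_list = [df.copy() for df in df_list]

    # Every frame must share the same column names and dtypes.
    first = df_list[0]
    if not all(first.dtypes.equals(df.dtypes) for df in df_list[1:]):
        raise ValueError(
            "Input DataFrames must have the same columns/dtypes.")

    # Align each categorical column on the union of all observed
    # categories so the concatenated column stays categorical.
    categorical_columns = first.columns[first.dtypes == 'category']
    for col in categorical_columns:
        new_categories = sorted(
            set().union(*(df[col].cat.categories for df in df_list))
        )
        for df in df_list:
            df[col] = df[col].cat.set_categories(new_categories)

    # No ignore_index: per-chunk indices are preserved, matching the
    # expected [0, 1, 2, 0, 1, 2, ...] index asserted in the tests below.
    return pd.concat(df_list)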
def run_chunked_pipeline(self, pipeline, start_date, end_date, chunksize):
    ranges = compute_date_range_chunks(
        self._calendar,
        start_date,
        end_date,
        chunksize,
    )
    chunks = [self.run_pipeline(pipeline, s, e) for s, e in ranges]
    return categorical_df_concat(chunks, inplace=True)
def test_categorical_df_concat(self):
    inp = [
        pd.DataFrame(
            {
                'A': pd.Series(['a', 'b', 'c'], dtype='category'),
                'B': pd.Series([100, 102, 103], dtype='int64'),
                'C': pd.Series(['x', 'x', 'x'], dtype='category'),
            }
        ),
        pd.DataFrame(
            {
                'A': pd.Series(['c', 'b', 'd'], dtype='category'),
                'B': pd.Series([103, 102, 104], dtype='int64'),
                'C': pd.Series(['y', 'y', 'y'], dtype='category'),
            }
        ),
        pd.DataFrame(
            {
                'A': pd.Series(['a', 'b', 'd'], dtype='category'),
                'B': pd.Series([101, 102, 104], dtype='int64'),
                'C': pd.Series(['z', 'z', 'z'], dtype='category'),
            }
        ),
    ]
    result = categorical_df_concat(inp)

    expected = pd.DataFrame(
        {
            'A': pd.Series(
                ['a', 'b', 'c', 'c', 'b', 'd', 'a', 'b', 'd'],
                dtype='category'
            ),
            'B': pd.Series(
                [100, 102, 103, 103, 102, 104, 101, 102, 104],
                dtype='int64'
            ),
            'C': pd.Series(
                ['x', 'x', 'x', 'y', 'y', 'y', 'z', 'z', 'z'],
                dtype='category'
            ),
        },
    )
    expected.index = pd.Int64Index([0, 1, 2, 0, 1, 2, 0, 1, 2])
    assert_equal(expected, result)
    assert_equal(
        expected['A'].cat.categories,
        result['A'].cat.categories
    )
    assert_equal(
        expected['C'].cat.categories,
        result['C'].cat.categories
    )
def run_chunked_pipeline(self, pipeline, start_date, end_date, chunksize):
    ranges = compute_date_range_chunks(
        self._calendar,
        start_date,
        end_date,
        chunksize,
    )
    chunks = [self.run_pipeline(pipeline, s, e) for s, e in ranges]

    if len(chunks) == 1:
        # OPTIMIZATION: Don't make an extra copy in `categorical_df_concat`
        # if we don't have to.
        return chunks[0]

    return categorical_df_concat(chunks, inplace=True)
def test_categorical_df_concat(self):
    inp = [
        pd.DataFrame({
            'A': pd.Series(['a', 'b', 'c'], dtype='category'),
            'B': pd.Series([100, 102, 103], dtype='int64'),
            'C': pd.Series(['x', 'x', 'x'], dtype='category'),
        }),
        pd.DataFrame({
            'A': pd.Series(['c', 'b', 'd'], dtype='category'),
            'B': pd.Series([103, 102, 104], dtype='int64'),
            'C': pd.Series(['y', 'y', 'y'], dtype='category'),
        }),
        pd.DataFrame({
            'A': pd.Series(['a', 'b', 'd'], dtype='category'),
            'B': pd.Series([101, 102, 104], dtype='int64'),
            'C': pd.Series(['z', 'z', 'z'], dtype='category'),
        }),
    ]
    result = categorical_df_concat(inp)

    expected = pd.DataFrame(
        {
            'A': pd.Series(
                ['a', 'b', 'c', 'c', 'b', 'd', 'a', 'b', 'd'],
                dtype='category',
            ),
            'B': pd.Series(
                [100, 102, 103, 103, 102, 104, 101, 102, 104],
                dtype='int64',
            ),
            'C': pd.Series(
                ['x', 'x', 'x', 'y', 'y', 'y', 'z', 'z', 'z'],
                dtype='category',
            ),
        },
    )
    expected.index = pd.Int64Index([0, 1, 2, 0, 1, 2, 0, 1, 2])
    assert_equal(expected, result)
    assert_equal(expected['A'].cat.categories, result['A'].cat.categories)
    assert_equal(expected['C'].cat.categories, result['C'].cat.categories)
def test_categorical_df_concat(self): inp = [ pd.DataFrame({ "A": pd.Series(["a", "b", "c"], dtype="category"), "B": pd.Series([100, 102, 103], dtype="int64"), "C": pd.Series(["x", "x", "x"], dtype="category"), }), pd.DataFrame({ "A": pd.Series(["c", "b", "d"], dtype="category"), "B": pd.Series([103, 102, 104], dtype="int64"), "C": pd.Series(["y", "y", "y"], dtype="category"), }), pd.DataFrame({ "A": pd.Series(["a", "b", "d"], dtype="category"), "B": pd.Series([101, 102, 104], dtype="int64"), "C": pd.Series(["z", "z", "z"], dtype="category"), }), ] result = categorical_df_concat(inp) expected = pd.DataFrame( { "A": pd.Series(["a", "b", "c", "c", "b", "d", "a", "b", "d"], dtype="category"), "B": pd.Series([100, 102, 103, 103, 102, 104, 101, 102, 104], dtype="int64"), "C": pd.Series(["x", "x", "x", "y", "y", "y", "z", "z", "z"], dtype="category"), }, ) expected.index = pd.Int64Index([0, 1, 2, 0, 1, 2, 0, 1, 2]) assert_equal(expected, result) assert_equal(expected["A"].cat.categories, result["A"].cat.categories) assert_equal(expected["C"].cat.categories, result["C"].cat.categories)
def run_chunked_pipeline(self,
                         pipeline,
                         start_date,
                         end_date,
                         chunksize,
                         hooks=None):
    """
    Compute values for ``pipeline`` from ``start_date`` to ``end_date``, in
    date chunks of size ``chunksize``.

    Chunked execution reduces memory consumption, and may reduce
    computation time depending on the contents of your pipeline.

    Parameters
    ----------
    pipeline : Pipeline
        The pipeline to run.
    start_date : pd.Timestamp
        The start date to run the pipeline for.
    end_date : pd.Timestamp
        The end date to run the pipeline for.
    chunksize : int
        The number of days to execute at a time.
    hooks : list[implements(PipelineHooks)], optional
        Hooks for instrumenting Pipeline execution.

    Returns
    -------
    result : pd.DataFrame
        A frame of computed results.

        The ``result`` columns correspond to the entries of
        `pipeline.columns`, which should be a dictionary mapping strings to
        instances of :class:`zipline.pipeline.term.Term`.

        For each date between ``start_date`` and ``end_date``, ``result``
        will contain a row for each asset that passed `pipeline.screen`. A
        screen of ``None`` indicates that a row should be returned for each
        asset that existed each day.

    See Also
    --------
    :meth:`zipline.pipeline.engine.PipelineEngine.run_pipeline`
    """
    domain = self.resolve_domain(pipeline)
    ranges = compute_date_range_chunks(
        domain.all_sessions(),
        start_date,
        end_date,
        chunksize,
    )
    hooks = self._resolve_hooks(hooks)
    run_pipeline = partial(self._run_pipeline_impl, pipeline, hooks=hooks)

    with hooks.running_pipeline(pipeline, start_date, end_date):
        chunks = [run_pipeline(s, e) for s, e in ranges]

    if len(chunks) == 1:
        # OPTIMIZATION: Don't make an extra copy in `categorical_df_concat`
        # if we don't have to.
        return chunks[0]

    return categorical_df_concat(chunks, inplace=True)
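run_chunked_pipeline leans on compute_date_range_chunks to carve the requested window into consecutive (chunk_start, chunk_end) session pairs. The real utility is not shown here; the following is only a rough sketch of the behavior implied above, assuming sessions is a sorted pd.DatetimeIndex of trading days containing both endpoints, and that chunksize=None means "one chunk".

import pandas as pd


def compute_date_range_chunks(sessions, start_date, end_date, chunksize):
    # Sketch only: yield consecutive (chunk_start, chunk_end) pairs of at
    # most ``chunksize`` sessions that cover [start_date, end_date] with no
    # gaps or overlap.
    start_ix = sessions.get_loc(start_date)
    end_ix = sessions.get_loc(end_date)

    if chunksize is None:
        return [(start_date, end_date)]

    return (
        (sessions[i], sessions[min(i + chunksize - 1, end_ix)])
        for i in range(start_ix, end_ix + 1, chunksize)
    )

Each pair then becomes one pipeline run, and the per-chunk frames are stitched back together with categorical_df_concat(chunks, inplace=True); inplace=True is safe there because the chunk frames are never used again after concatenation.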