def test_compute_date_range_chunks(self, chunksize, expected):
    """Check the chunks computed for a fixed January 2017 window.

    ``chunksize`` and ``expected`` are supplied by the caller; the ranges
    produced for ``chunksize`` must equal ``expected`` exactly.
    """
    # 2017-01-03 through 2017-01-31 results in 20 business days.
    window_start, window_end = T("2017-01-03"), T("2017-01-31")
    chunks = compute_date_range_chunks(
        self.calendar.all_sessions,
        window_start,
        window_end,
        chunksize,
    )
    assert list(chunks) == expected
def test_compute_date_range_chunks(self, chunksize, expected):
    """Verify ``compute_date_range_chunks`` against ``expected``.

    The date window is fixed; only ``chunksize`` (and therefore the
    ``expected`` list of ranges) varies between invocations.
    """
    # The sessions between these two dates add up to 20 business days.
    first_day = T('2017-01-03')
    last_day = T('2017-01-31')

    computed = compute_date_range_chunks(
        self.calendar.all_sessions, first_day, last_day, chunksize,
    )

    self.assertListEqual(list(computed), expected)
def run_chunked_pipeline(self, pipeline, start_date, end_date, chunksize):
    """Run ``pipeline`` one date chunk at a time and combine the results.

    The span from ``start_date`` to ``end_date`` is divided by
    ``compute_date_range_chunks`` using ``self._calendar`` and ``chunksize``.
    ``self.run_pipeline`` is then called once per resulting (start, end)
    pair, in order.

    Returns
    -------
    The value of ``categorical_df_concat`` applied, with ``inplace=True``,
    to the list of per-chunk results.
    """
    date_chunks = compute_date_range_chunks(
        self._calendar, start_date, end_date, chunksize
    )

    per_chunk = []
    for chunk_start, chunk_end in date_chunks:
        per_chunk.append(self.run_pipeline(pipeline, chunk_start, chunk_end))

    return categorical_df_concat(per_chunk, inplace=True)
def test_compute_date_range_chunks_invalid_input(self):
    """Invalid date arguments must raise with the exact expected messages."""

    def expect_failure(exc_type, start, end, message):
        # Run one failing call and compare the stringified exception
        # against ``message``.
        with self.assertRaises(exc_type) as caught:
            compute_date_range_chunks(
                self.calendar.all_sessions, T(start), T(end), None)
        self.assertEqual(str(caught.exception), message)

    # Start date not found in calendar (2017-05-07 is a Sunday).
    expect_failure(
        KeyError, '2017-05-07', '2017-06-01',
        "'Start date 2017-05-07 is not found in calendar.'")

    # End date not found in calendar (2017-05-27 is a Saturday).
    expect_failure(
        KeyError, '2017-05-01', '2017-05-27',
        "'End date 2017-05-27 is not found in calendar.'")

    # End date before start date.
    expect_failure(
        ValueError, '2017-06-01', '2017-05-01',
        "End date 2017-05-01 cannot precede start date 2017-06-01.")
def test_compute_date_range_chunks_invalid_input(self):
    """Invalid date arguments must raise with the exact expected messages.

    Three cases are checked in order: a start date missing from the
    calendar, an end date missing from the calendar, and an end date that
    precedes the start date.
    """
    # ``pytest.raises(match=...)`` interprets its argument as a regular
    # expression, so the literal messages are escaped; otherwise the "."
    # characters would match any character and weaken the check.
    import re

    # Start date not found in calendar
    err_msg = "'Start date 2017-05-07 is not found in calendar.'"
    with pytest.raises(KeyError, match=re.escape(err_msg)):
        compute_date_range_chunks(
            self.calendar.all_sessions,
            T("2017-05-07"),  # Sunday
            T("2017-06-01"),
            None,
        )

    # End date not found in calendar
    err_msg = "'End date 2017-05-27 is not found in calendar.'"
    with pytest.raises(KeyError, match=re.escape(err_msg)):
        compute_date_range_chunks(
            self.calendar.all_sessions,
            T("2017-05-01"),
            T("2017-05-27"),  # Saturday
            None,
        )

    # End date before start date
    err_msg = "End date 2017-05-01 cannot precede start date 2017-06-01."
    with pytest.raises(ValueError, match=re.escape(err_msg)):
        compute_date_range_chunks(
            self.calendar.all_sessions,
            T("2017-06-01"),
            T("2017-05-01"),
            None,
        )
def test_compute_date_range_chunks(self, chunksize, expected):
    """Compare the computed date-range chunks with ``expected``.

    The start and end dates are fixed; ``chunksize`` is forwarded to
    ``compute_date_range_chunks`` unchanged.
    """
    # From 2017-01-03 to 2017-01-31 there are 20 business days.
    range_start, range_end = T('2017-01-03'), T('2017-01-31')
    actual = compute_date_range_chunks(
        self.calendar.all_sessions, range_start, range_end, chunksize,
    )
    self.assertListEqual(list(actual), expected)
def run_chunked_pipeline(self, pipeline, start_date, end_date, chunksize):
    """Run ``pipeline`` in date chunks and return the combined result.

    ``compute_date_range_chunks`` splits ``start_date``..``end_date`` using
    ``self._calendar`` and ``chunksize``; ``self.run_pipeline`` is invoked
    once for every (start, end) pair it yields.

    Returns
    -------
    The lone chunk result when exactly one chunk was produced, otherwise
    the value of ``categorical_df_concat`` over all chunk results (called
    with ``inplace=True``).
    """
    results = []
    for lo, hi in compute_date_range_chunks(
        self._calendar,
        start_date,
        end_date,
        chunksize,
    ):
        results.append(self.run_pipeline(pipeline, lo, hi))

    if len(results) != 1:
        return categorical_df_concat(results, inplace=True)

    # OPTIMIZATION: with a single chunk there is nothing to merge, so skip
    # the extra copy that `categorical_df_concat` would make.
    return results[0]
def test_compute_date_range_chunks_invalid_input(self):
    """Each invalid start/end combination must raise the expected error.

    Every case is (exception type, start date, end date, expected
    ``str(exception)``); the cases run in the order listed.
    """
    cases = (
        # Start date not found in calendar (a Sunday).
        (
            KeyError,
            '2017-05-07',
            '2017-06-01',
            "'Start date 2017-05-07 is not found in calendar.'",
        ),
        # End date not found in calendar (a Saturday).
        (
            KeyError,
            '2017-05-01',
            '2017-05-27',
            "'End date 2017-05-27 is not found in calendar.'",
        ),
        # End date before start date.
        (
            ValueError,
            '2017-06-01',
            '2017-05-01',
            "End date 2017-05-01 cannot precede start date 2017-06-01.",
        ),
    )

    for error_type, start, end, message in cases:
        with self.assertRaises(error_type) as ctx:
            compute_date_range_chunks(
                self.calendar.all_sessions,
                T(start),
                T(end),
                None
            )
        self.assertEqual(str(ctx.exception), message)
def run_chunked_pipeline(
    self, pipeline, start_date, end_date, chunksize, hooks=None
):
    """
    Evaluate ``pipeline`` between ``start_date`` and ``end_date``, working
    through the dates ``chunksize`` days at a time.

    Running in chunks reduces memory consumption and, depending on the
    contents of the pipeline, may also reduce computation time.

    Parameters
    ----------
    pipeline : Pipeline
        The pipeline to run.
    start_date : pd.Timestamp
        First date to compute the pipeline for.
    end_date : pd.Timestamp
        Last date to compute the pipeline for.
    chunksize : int
        Number of days to execute per chunk.
    hooks : list[implements(PipelineHooks)], optional
        Hooks for instrumenting Pipeline execution.

    Returns
    -------
    result : pd.DataFrame
        A frame of computed results.

        The columns of ``result`` correspond to the entries of
        `pipeline.columns`, which should be a dictionary mapping strings
        to instances of :class:`zipline.pipeline.term.Term`.

        For each date between ``start_date`` and ``end_date``, ``result``
        contains a row for each asset that passed `pipeline.screen`. A
        screen of ``None`` indicates that a row should be returned for
        each asset that existed each day.

    See Also
    --------
    :meth:`zipline.pipeline.engine.PipelineEngine.run_pipeline`
    """
    sessions = self.resolve_domain(pipeline).all_sessions()
    date_chunks = compute_date_range_chunks(
        sessions, start_date, end_date, chunksize
    )

    active_hooks = self._resolve_hooks(hooks)
    run_one = self._run_pipeline_impl

    results = []
    with active_hooks.running_pipeline(pipeline, start_date, end_date):
        for chunk_start, chunk_end in date_chunks:
            results.append(
                run_one(pipeline, chunk_start, chunk_end, hooks=active_hooks)
            )

    if len(results) != 1:
        return categorical_df_concat(results, inplace=True)

    # OPTIMIZATION: a single chunk is handed back directly so that
    # `categorical_df_concat` does not make an extra copy.
    return results[0]
def run_chunked_pipeline(
    self, pipeline, start_date, end_date, chunksize, hooks=None
):
    """
    Compute values for ``pipeline`` from ``start_date`` to ``end_date``, in
    date chunks of size ``chunksize``.

    Chunked execution reduces memory consumption, and may reduce
    computation time depending on the contents of your pipeline.

    Parameters
    ----------
    pipeline : Pipeline
        The pipeline to run.
    start_date : pd.Timestamp
        The start date to run the pipeline for.
    end_date : pd.Timestamp
        The end date to run the pipeline for.
    chunksize : int
        The number of days to execute at a time.
    hooks : list[implements(PipelineHooks)], optional
        Hooks for instrumenting Pipeline execution.

    Returns
    -------
    result : pd.DataFrame
        A frame of computed results.

        The ``result`` columns correspond to the entries of
        `pipeline.columns`, which should be a dictionary mapping strings to
        instances of :class:`zipline.pipeline.Term`.

        For each date between ``start_date`` and ``end_date``, ``result``
        will contain a row for each asset that passed `pipeline.screen`.
        A screen of ``None`` indicates that a row should be returned for
        each asset that existed each day.

        If several chunks were computed and every one of them is empty,
        the first (empty) chunk is returned as-is.

    See Also
    --------
    :meth:`zipline.pipeline.engine.PipelineEngine.run_pipeline`
    """
    domain = self.resolve_domain(pipeline)
    ranges = compute_date_range_chunks(
        domain.all_sessions(),
        start_date,
        end_date,
        chunksize,
    )
    hooks = self._resolve_hooks(hooks)
    run_pipeline = partial(self._run_pipeline_impl, pipeline, hooks=hooks)
    with hooks.running_pipeline(pipeline, start_date, end_date):
        chunks = [run_pipeline(s, e) for s, e in ranges]

    if len(chunks) == 1:
        # A single chunk needs no concatenation; return it untouched.
        return chunks[0]

    # Filter out empty chunks. Empty dataframes lose dtype information,
    # which makes concatenation fail.
    nonempty_chunks = [c for c in chunks if len(c)]

    if chunks and not nonempty_chunks:
        # Every chunk is empty, so there is nothing to concatenate.
        # Passing an empty list on would fail (assuming ``concat`` is
        # ``pandas.concat``, it raises ValueError for an empty sequence --
        # confirm against the file's imports), so return an empty chunk.
        return chunks[0]

    # pandas would fill missing columns with NaT
    return concat(nonempty_chunks)