def _slice_locs(self, start_date, end_date): try: start = self._calendar.get_loc(start_date) except KeyError: if start_date < self._calendar[0]: raise NoFurtherDataError(msg=( "FFC Query requesting data starting on {query_start}, " "but first known date is {calendar_start}").format( query_start=str(start_date), calendar_start=str(self._calendar[0]), )) else: raise ValueError("Query start %s not in calendar" % start_date) try: stop = self._calendar.get_loc(end_date) except: if end_date > self._calendar[-1]: raise NoFurtherDataError( msg=("FFC Query requesting data up to {query_end}, " "but last known date is {calendar_end}").format( query_end=end_date, calendar_end=self._calendar[-1], )) else: raise ValueError("Query end %s not in calendar" % end_date) return start, stop
def shift_dates(dates, start_date, end_date, shift): """ Shift dates of a pipeline query back by `shift` days. load_adjusted_array is called with dates on which the user's algo will be shown data, which means we need to return the data that would be known at the start of each date. This is often labeled with a previous date in the underlying data (e.g. at the start of today, we have the data as of yesterday). In this case, we can shift the query dates back to query the appropriate values. Parameters ---------- dates : DatetimeIndex All known dates. start_date : pd.Timestamp Start date of the pipeline query. end_date : pd.Timestamp End date of the pipeline query. shift : int The number of days to shift back the query dates. """ try: start = dates.get_loc(start_date) except KeyError: if start_date < dates[0]: raise NoFurtherDataError(msg=( "Pipeline Query requested data starting on {query_start}, " "but first known date is {calendar_start}").format( query_start=str(start_date), calendar_start=str(dates[0]), )) else: raise ValueError("Query start %s not in calendar" % start_date) # Make sure that shifting doesn't push us out of the calendar. if start < shift: raise NoFurtherDataError( msg=("Pipeline Query requested data from {shift}" " days before {query_start}, but first known date is only " "{start} days earlier.").format(shift=shift, query_start=start_date, start=start), ) try: end = dates.get_loc(end_date) except KeyError: if end_date > dates[-1]: raise NoFurtherDataError( msg=("Pipeline Query requesting data up to {query_end}, " "but last known date is {calendar_end}").format( query_end=end_date, calendar_end=dates[-1], )) else: raise ValueError("Query end %s not in calendar" % end_date) return dates[start - shift], dates[end - shift]
def add_scheduled_days(n, date, next_scheduled_day_hook, previous_scheduled_day_hook, all_trading_days): """ Adds n trading days to date. If this would fall outside of the trading calendar, a NoFurtherDataError is raised. Parameters ---------- n : int The number of days to add to date, this can be positive or negative. date : datetime The date to add to. Returns ------- datetime n trading days added to date. """ if n == 1: return next_scheduled_day_hook(date) if n == -1: return previous_scheduled_day_hook(date) idx = _get_index(date, all_trading_days) + n if idx < 0 or idx >= len(all_trading_days): raise NoFurtherDataError(msg='Cannot add %d days to %s' % (n, date)) return all_trading_days[idx]
def add_trading_days(self, n, date): """ Adds n trading days to date. If this would fall outside of the trading calendar, a NoFurtherDataError is raised. :Arguments: n : int The number of days to add to date, this can be positive or negative. date : datetime The date to add to. :Returns: new_date : datetime n trading days added to date. """ if n == 1: return self.next_trading_day(date) if n == -1: return self.previous_trading_day(date) idx = self.get_index(date) + n if idx < 0 or idx >= len(self.trading_days): raise NoFurtherDataError( msg='Cannot add %d days to %s' % (n, date) ) return self.trading_days[idx]
def _compute_root_mask(self, start_date, end_date, extra_rows): """ Compute a lifetimes matrix from our AssetFinder, then drop columns that didn't exist at all during the query dates. Parameters ---------- start_date : pd.Timestamp Base start date for the matrix. end_date : pd.Timestamp End date for the matrix. extra_rows : int Number of extra rows to compute before `start_date`. Extra rows are needed by terms like moving averages that require a trailing window of data. Returns ------- lifetimes : pd.DataFrame Frame of dtype `bool` containing dates from `extra_rows` days before `start_date`, continuing through to `end_date`. The returned frame contains as columns all assets in our AssetFinder that existed for at least one day between `start_date` and `end_date`. """ calendar = self._calendar finder = self._finder start_idx, end_idx = self._calendar.slice_locs(start_date, end_date) if start_idx < extra_rows: raise NoFurtherDataError( msg="Insufficient data to compute Pipeline mask: " "start date was %s, " "earliest known date was %s, " "and %d extra rows were requested." % ( start_date, calendar[0], extra_rows, ), ) # Build lifetimes matrix reaching back to `extra_rows` days before # `start_date.` lifetimes = finder.lifetimes( calendar[start_idx - extra_rows:end_idx], include_start_date=False ) assert lifetimes.index[extra_rows] == start_date assert lifetimes.index[-1] == end_date if not lifetimes.columns.unique: columns = lifetimes.columns duplicated = columns[columns.duplicated()].unique() raise AssertionError("Duplicated sids: %d" % duplicated) # Filter out columns that didn't exist between the requested start and # end dates. existed = lifetimes.iloc[extra_rows:].any() ret = lifetimes.loc[:, existed] shape = ret.shape assert shape[0] * shape[1] != 0, 'root mask cannot be empty' return ret
def _compute_root_mask(self, start_date, end_date, extra_rows): """ Compute a lifetimes matrix from our AssetFinder, then drop columns that didn't exist at all during the query dates. Parameters ---------- start_date : pd.Timestamp Base start date for the matrix. end_date : pd.Timestamp End date for the matrix. extra_rows : int Number of extra rows to compute before `start_date`. Extra rows are needed by terms like moving averages that require a trailing window of data. Returns ------- lifetimes : pd.DataFrame Frame of dtype `bool` containing dates from `extra_rows` days before `start_date`, continuing through to `end_date`. The returned frame contains as columns all assets in our AssetFinder that existed for at least one day between `start_date` and `end_date`. """ calendar = self._calendar finder = self._finder start_idx, end_idx = self._calendar.slice_locs(start_date, end_date) if start_idx < extra_rows: raise NoFurtherDataError.from_lookback_window( initial_message="Insufficient data to compute Pipeline:", first_date=calendar[0], lookback_start=start_date, lookback_length=extra_rows, ) # Build lifetimes matrix reaching back to `extra_rows` days before # `start_date.` lifetimes = finder.lifetimes( calendar[start_idx - extra_rows:end_idx], include_start_date=False ) assert lifetimes.index[extra_rows] == start_date assert lifetimes.index[-1] == end_date if not lifetimes.columns.unique: columns = lifetimes.columns duplicated = columns[columns.duplicated()].unique() raise AssertionError("Duplicated sids: %d" % duplicated) # Filter out columns that didn't exist between the requested start and # end dates. existed = lifetimes.iloc[extra_rows:].any() ret = lifetimes.loc[:, existed] shape = ret.shape assert shape[0] * shape[1] != 0, 'root mask cannot be empty' return ret
def _compute_root_mask(self, start_date, end_date, extra_rows): """ Compute a lifetimes matrix from our AssetFinder, then drop columns that didn't exist at all during the query dates. Parameters ---------- start_date : pd.Timestamp Base start date for the matrix. end_date : pd.Timestamp End date for the matrix. extra_rows : int Number of extra rows to compute before `start_date`. Extra rows are needed by terms like moving averages that require a trailing window of data. Returns ------- lifetimes : pd.DataFrame Frame of dtype `bool` containing dates from `extra_rows` days before `start_date`, continuing through to `end_date`. The returned frame contains as columns all assets in our AssetFinder that existed for at least one day between `start_date` and `end_date`. """ calendar = self._calendar start_idx, end_idx = calendar.slice_locs(start_date, end_date) if start_idx < extra_rows: raise NoFurtherDataError.from_lookback_window( initial_message="Insufficient data to compute Pipeline:", first_date=calendar[0], lookback_start=start_date, lookback_length=extra_rows, ) # Build lifetimes matrix reaching back to `extra_rows` days before # `start_date.` symbols = self._list_symbols() dates = calendar[start_idx - extra_rows:end_idx] symbols = sorted(symbols) lifetimes = pd.DataFrame(True, index=dates, columns=symbols) assert lifetimes.index[extra_rows] == start_date assert lifetimes.index[-1] == end_date if not lifetimes.columns.unique: columns = lifetimes.columns duplicated = columns[columns.duplicated()].unique() raise AssertionError("Duplicated sids: %d" % duplicated) # Filter out columns that didn't exist between the requested start and # end dates. existed = lifetimes.iloc[extra_rows:].any() ret = lifetimes.loc[:, existed] shape = ret.shape assert shape[0] * shape[1] != 0, 'root mask cannot be empty' return ret
def _shift_dates(dates, start_date, end_date, shift): try: start = dates.get_loc(start_date) except KeyError: if start_date < dates[0]: raise NoFurtherDataError( msg=( "Pipeline Query requested data starting on {query_start}, " "but first known date is {calendar_start}" ).format( query_start=str(start_date), calendar_start=str(dates[0]), ) ) else: raise ValueError("Query start %s not in calendar" % start_date) # Make sure that shifting doesn't push us out of the calendar. if start < shift: raise NoFurtherDataError( msg=( "Pipeline Query requested data from {shift}" " days before {query_start}, but first known date is only " "{start} days earlier." ).format(shift=shift, query_start=start_date, start=start), ) try: end = dates.get_loc(end_date) except KeyError: if end_date > dates[-1]: raise NoFurtherDataError( msg=( "Pipeline Query requesting data up to {query_end}, " "but last known date is {calendar_end}" ).format( query_end=end_date, calendar_end=dates[-1], ) ) else: raise ValueError("Query end %s not in calendar" % end_date) return dates[start - shift], dates[end - shift]
def previous_open_and_close(self, start_date): """ Given the start_date, returns the previous open and close of the market. """ previous = self.previous_trading_day(start_date) if previous is None: raise NoFurtherDataError( msg=("Attempt to backtest beyond available history. " "First known date: %s" % self.first_trading_day)) return self.get_open_and_close(previous)
def next_open_and_close(self, start_date): """ Given the start_date, returns the next open and close of the market. """ next_open = self.next_trading_day(start_date) if next_open is None: raise NoFurtherDataError( msg=("Attempt to backtest beyond available history. " "Last known date: %s" % self.last_trading_day)) return self.get_open_and_close(next_open)
def build_lifetimes_matrix(self, start_date, end_date, extra_rows): """ Compute a lifetimes matrix from our AssetFinder, then drop columns that didn't exist at all during the query dates. Parameters ---------- start_date : pd.Timestamp Base start date for the matrix. end_date : pd.Timestamp End date for the matrix. extra_rows : int Number of rows prior to `start_date` to include. Extra rows are needed by terms like moving averages that require a trailing window of data to compute. Returns ------- lifetimes : pd.DataFrame Frame of dtype `bool` containing dates from `extra_rows` days before `start_date`, continuing through to `end_date`. The returned frame contains as columns all assets in our AssetFinder that existed for at least one day between `start_date` and `end_date`. """ calendar = self._calendar finder = self._finder start_idx, end_idx = self._calendar.slice_locs(start_date, end_date) if start_idx < extra_rows: raise NoFurtherDataError( msg="Insufficient data to compute FFC Matrix: " "start date was %s, " "earliest known date was %s, " "and %d extra rows were requested." % ( start_date, calendar[0], extra_rows, ), ) # Build lifetimes matrix reaching back as far start_date plus # max_extra_rows. lifetimes = finder.lifetimes(calendar[start_idx - extra_rows:end_idx]) assert lifetimes.index[extra_rows] == start_date assert lifetimes.index[-1] == end_date # Filter out columns that didn't exist between the requested start and # end dates. existed = lifetimes.iloc[extra_rows:].any() return lifetimes.loc[:, existed]
def _compute_root_mask(self, start_date, end_date, extra_rows): """ Compute a lifetimes matrix from our AssetFinder, then drop columns that didn't exist at all during the query dates. Parameters ---------- start_date : pd.Timestamp Base start date for the matrix. end_date : pd.Timestamp End date for the matrix. extra_rows : int Number of extra rows to compute before `start_date`. Extra rows are needed by terms like moving averages that require a trailing window of data. Returns ------- lifetimes : pd.DataFrame Frame of dtype `bool` containing dates from `extra_rows` days before `start_date`, continuing through to `end_date`. The returned frame contains as columns all assets in our AssetFinder that existed for at least one day between `start_date` and `end_date`. """ calendar = self._calendar finder = self._finder start_idx, end_idx = self._calendar.slice_locs(start_date, end_date) if start_idx < extra_rows: raise NoFurtherDataError.from_lookback_window( initial_message="Insufficient data to compute Pipeline:", first_date=calendar[0], lookback_start=start_date, lookback_length=extra_rows, ) # Build lifetimes matrix reaching back to `extra_rows` days before # `start_date.` lifetimes = finder.lifetimes( calendar[start_idx - extra_rows:end_idx], include_start_date=False ) if lifetimes.index[extra_rows] != start_date: raise ValueError( 'The first date of the lifetimes matrix does not match the' ' start date of the pipeline. Did you forget to align the' ' start_date to the trading calendar?' ) if lifetimes.index[-1] != end_date: raise ValueError( 'The last date of the lifetimes matrix does not match the' ' start date of the pipeline. Did you forget to align the' ' end_date to the trading calendar?' ) if not lifetimes.columns.unique: columns = lifetimes.columns duplicated = columns[columns.duplicated()].unique() raise AssertionError("Duplicated sids: %d" % duplicated) # Filter out columns that didn't exist from the farthest look back # window through the end of the requested dates. existed = lifetimes.any() ret = lifetimes.loc[:, existed] shape = ret.shape if shape[0] * shape[1] == 0: raise ValueError( "Found only empty asset-days between {} and {}.\n" "This probably means that either your asset db is out of date" " or that you're trying to run a Pipeline during a period with" " no market days.".format(start_date, end_date), ) return ret
def shift_dates(dates, start_date, end_date, shift): """ Shift dates of a pipeline query back by ``shift`` days. Parameters ---------- dates : DatetimeIndex All known dates. start_date : pd.Timestamp Start date of the pipeline query. end_date : pd.Timestamp End date of the pipeline query. shift : int The number of days to shift back the query dates. Returns ------- shifted : pd.DatetimeIndex The range [start_date, end_date] from ``dates``, shifted backwards by ``shift`` days. Raises ------ ValueError If ``start_date`` or ``end_date`` is not in ``dates``. NoFurtherDataError If shifting ``start_date`` back by ``shift`` days would push it off the end of ``dates``. """ try: start = dates.get_loc(start_date) except KeyError: if start_date < dates[0]: raise NoFurtherDataError( msg=( "Pipeline Query requested data starting on {query_start}, " "but first known date is {calendar_start}" ).format( query_start=str(start_date), calendar_start=str(dates[0]), ) ) else: raise ValueError("Query start %s not in calendar" % start_date) # Make sure that shifting doesn't push us out of the calendar. if start < shift: raise NoFurtherDataError( msg=( "Pipeline Query requested data from {shift}" " days before {query_start}, but first known date is only " "{start} days earlier." ).format(shift=shift, query_start=start_date, start=start), ) try: end = dates.get_loc(end_date) except KeyError: if end_date > dates[-1]: raise NoFurtherDataError( msg=( "Pipeline Query requesting data up to {query_end}, " "but last known date is {calendar_end}" ).format( query_end=end_date, calendar_end=dates[-1], ) ) else: raise ValueError("Query end %s not in calendar" % end_date) return dates[start - shift:end - shift + 1] # +1 to be inclusive
def _compute_root_mask(self, domain, start_date, end_date, extra_rows): """ Compute a lifetimes matrix from our AssetFinder, then drop columns that didn't exist at all during the query dates. Parameters ---------- domain : zipline.pipeline.domain.Domain Domain for which we're computing a pipeline. start_date : pd.Timestamp Base start date for the matrix. end_date : pd.Timestamp End date for the matrix. extra_rows : int Number of extra rows to compute before `start_date`. Extra rows are needed by terms like moving averages that require a trailing window of data. Returns ------- lifetimes : pd.DataFrame Frame of dtype `bool` containing dates from `extra_rows` days before `start_date`, continuing through to `end_date`. The returned frame contains as columns all assets in our AssetFinder that existed for at least one day between `start_date` and `end_date`. """ sessions = domain.all_sessions() if start_date not in sessions: raise ValueError( "Pipeline start date ({}) is not a trading session for " "domain {}.".format(start_date, domain)) elif end_date not in sessions: raise ValueError( "Pipeline end date {} is not a trading session for " "domain {}.".format(end_date, domain)) start_idx, end_idx = sessions.slice_locs(start_date, end_date) if start_idx < extra_rows: raise NoFurtherDataError.from_lookback_window( initial_message="Insufficient data to compute Pipeline:", first_date=sessions[0], lookback_start=start_date, lookback_length=extra_rows, ) # NOTE: This logic should probably be delegated to the domain once we # start adding more complex domains. # # Build lifetimes matrix reaching back to `extra_rows` days before # `start_date.` finder = self._finder lifetimes = finder.lifetimes( sessions[start_idx - extra_rows:end_idx], include_start_date=False, country_codes=(domain.country_code, ), ) if not lifetimes.columns.unique: columns = lifetimes.columns duplicated = columns[columns.duplicated()].unique() raise AssertionError("Duplicated sids: %d" % duplicated) # Filter out columns that didn't exist from the farthest look back # window through the end of the requested dates. existed = lifetimes.any() ret = lifetimes.loc[:, existed] num_assets = ret.shape[1] if num_assets == 0: raise ValueError( "Failed to find any assets with country_code {!r} that traded " "between {} and {}.\n" "This probably means that your asset db is old or that it has " "incorrect country/exchange metadata.".format( domain.country_code, start_date, end_date, )) return ret
def compute_extra_rows(self, all_dates, start_date, end_date, min_extra_rows): """ Ensure that min_extra_rows pushes us back to a computation date. Parameters ---------- all_dates : pd.DatetimeIndex The trading sessions against which ``self`` will be computed. start_date : pd.Timestamp The first date for which final output is requested. end_date : pd.Timestamp The last date for which final output is requested. min_extra_rows : int The minimum number of extra rows required of ``self``, as determined by other terms that depend on ``self``. Returns ------- extra_rows : int The number of extra rows to compute. This will be the minimum number of rows required to make our computed start_date fall on a recomputation date. """ try: current_start_pos = all_dates.get_loc(start_date) - min_extra_rows if current_start_pos < 0: raise NoFurtherDataError.from_lookback_window( initial_message="Insufficient data to compute Pipeline:", first_date=all_dates[0], lookback_start=start_date, lookback_length=min_extra_rows, ) except KeyError: before, after = nearest_unequal_elements(all_dates, start_date) raise ValueError( "Pipeline start_date {start_date} is not in calendar.\n" "Latest date before start_date is {before}.\n" "Earliest date after start_date is {after}.".format( start_date=start_date, before=before, after=after, ) ) # Our possible target dates are all the dates on or before the current # starting position. # TODO: Consider bounding this below by self.window_length candidates = all_dates[:current_start_pos + 1] # Choose the latest date in the candidates that is the start of a new # period at our frequency. choices = select_sampling_indices(candidates, self._frequency) # If we have choices, the last choice is the first date if the # period containing current_start_date. Choose it. new_start_date = candidates[choices[-1]] # Add the difference between the new and old start dates to get the # number of rows for the new start_date. new_start_pos = all_dates.get_loc(new_start_date) assert new_start_pos <= current_start_pos, \ "Computed negative extra rows!" return min_extra_rows + (current_start_pos - new_start_pos)
def compute_extra_rows(self, all_dates, start_date, end_date, min_extra_rows): """ Ensure that min_extra_rows pushes us back to a computation date. Parameters ---------- all_dates : pd.DatetimeIndex The trading sessions against which ``self`` will be computed. start_date : pd.Timestamp The first date for which final output is requested. end_date : pd.Timestamp The last date for which final output is requested. min_extra_rows : int The minimum number of extra rows required of ``self``, as determined by other terms that depend on ``self``. Returns ------- extra_rows : int The number of extra rows to compute. This will be the minimum number of rows required to make our computed start_date fall on a recomputation date. """ try: current_start_pos = all_dates.get_loc(start_date) - min_extra_rows if current_start_pos < 0: raise NoFurtherDataError.from_lookback_window( initial_message="Insufficient data to compute Pipeline:", first_date=all_dates[0], lookback_start=start_date, lookback_length=min_extra_rows, ) except KeyError: before, after = nearest_unequal_elements(all_dates, start_date) raise ValueError( "Pipeline start_date {start_date} is not in calendar.\n" "Latest date before start_date is {before}.\n" "Earliest date after start_date is {after}.".format( start_date=start_date, before=before, after=after, )) # Our possible target dates are all the dates on or before the current # starting position. # TODO: Consider bounding this below by self.window_length candidates = all_dates[:current_start_pos + 1] # Choose the latest date in the candidates that is the start of a new # period at our frequency. choices = select_sampling_indices(candidates, self._frequency) # If we have choices, the last choice is the first date if the # period containing current_start_date. Choose it. new_start_date = candidates[choices[-1]] # Add the difference between the new and old start dates to get the # number of rows for the new start_date. new_start_pos = all_dates.get_loc(new_start_date) assert new_start_pos <= current_start_pos, \ "Computed negative extra rows!" return min_extra_rows + (current_start_pos - new_start_pos)
def _compute_root_mask(self, domain, start_date, end_date, extra_rows): """ Compute a lifetimes matrix from our AssetFinder, then drop columns that didn't exist at all during the query dates. Parameters ---------- domain : zipline.pipeline.domain.Domain Domain for which we're computing a pipeline. start_date : pd.Timestamp Base start date for the matrix. end_date : pd.Timestamp End date for the matrix. extra_rows : int Number of extra rows to compute before `start_date`. Extra rows are needed by terms like moving averages that require a trailing window of data. Returns ------- lifetimes : pd.DataFrame Frame of dtype `bool` containing dates from `extra_rows` days before `start_date`, continuing through to `end_date`. The returned frame contains as columns all assets in our AssetFinder that existed for at least one day between `start_date` and `end_date`. """ sessions = domain.all_sessions() if start_date not in sessions: raise ValueError( "Pipeline start date ({}) is not a trading session for " "domain {}.".format(start_date, domain) ) elif end_date not in sessions: raise ValueError( "Pipeline end date {} is not a trading session for " "domain {}.".format(end_date, domain) ) start_idx, end_idx = sessions.slice_locs(start_date, end_date) if start_idx < extra_rows: raise NoFurtherDataError.from_lookback_window( initial_message="Insufficient data to compute Pipeline:", first_date=sessions[0], lookback_start=start_date, lookback_length=extra_rows, ) # NOTE: This logic should probably be delegated to the domain once we # start adding more complex domains. # # Build lifetimes matrix reaching back to `extra_rows` days before # `start_date.` finder = self._finder lifetimes = finder.lifetimes( sessions[start_idx - extra_rows:end_idx], include_start_date=False, country_codes=(domain.country_code,), ) if not lifetimes.columns.unique: columns = lifetimes.columns duplicated = columns[columns.duplicated()].unique() raise AssertionError("Duplicated sids: %d" % duplicated) # Filter out columns that didn't exist from the farthest look back # window through the end of the requested dates. existed = lifetimes.any() ret = lifetimes.loc[:, existed] num_assets = ret.shape[1] if num_assets == 0: raise ValueError( "Failed to find any assets with country_code {!r} that traded " "between {} and {}.\n" "This probably means that your asset db is old or that it has " "incorrect country/exchange metadata.".format( domain.country_code, start_date, end_date, ) ) return ret