Example #1
0
 def test_reasonable_keyerror(self):
     # GH #1062
     index = DatetimeIndex(['1/3/2000'])
     try:
         index.get_loc('1/1/2000')
     except KeyError as e:
         assert '2000' in str(e)
Example #2
0
 def test_reasonable_keyerror(self):
     # GH #1062
     index = DatetimeIndex(["1/3/2000"])
     try:
         index.get_loc("1/1/2000")
     except KeyError as e:
         self.assertIn("2000", str(e))
Example #3
0
class BcolzDailyBarReader(object):
    """
    Reader for raw pricing data written by BcolzDailyOHLCVWriter.

    A Bcolz CTable is composed of Columns and Attributes.

    Columns
    -------
    The table with which this loader interacts contains the following columns:

    ['open', 'high', 'low', 'close', 'volume', 'day', 'id'].

    The data in these columns is interpreted as follows:

    - Price columns ('open', 'high', 'low', 'close') are interpreted as 1000 *
      as-traded dollar value.
    - Volume is interpreted as as-traded volume.
    - Day is interpreted as seconds since midnight UTC, Jan 1, 1970.
    - Id is the asset id of the row.

    The data in each column is grouped by asset and then sorted by day within
    each asset block.

    The table is built to represent a long time range of data, e.g. ten years
    of equity data, so the asset blocks are generally of unequal length. The
    blocks are clipped to the known start and end date of each asset to cut
    down on the number of empty values that would need to be included to make
    a regular/cubic dataset.

    When read across columns, rows of open, high, low, close, and volume with
    the same index represent the same asset and day.

    Attributes
    ----------
    The table with which this loader interacts contains the following
    attributes:

    first_row : dict
        Map from asset_id -> index of first row in the dataset with that id.
    last_row : dict
        Map from asset_id -> index of last row in the dataset with that id.
    calendar_offset : dict
        Map from asset_id -> calendar index of first row.
    calendar : list[int64]
        Calendar used to compute offsets, in asi8 format (ns since EPOCH).

    We use first_row and last_row together to quickly find ranges of rows to
    load when reading an asset's data into memory.

    We use calendar_offset and calendar to orient loaded blocks within a
    range of queried dates.
    """
    def __init__(self, table):
        if isinstance(table, string_types):
            table = ctable(rootdir=table, mode='r')

        self._table = table
        self._calendar = DatetimeIndex(table.attrs['calendar'], tz='UTC')
        self._first_rows = {
            int(asset_id): start_index
            for asset_id, start_index in iteritems(table.attrs['first_row'])
        }
        self._last_rows = {
            int(asset_id): end_index
            for asset_id, end_index in iteritems(table.attrs['last_row'])
        }
        self._calendar_offsets = {
            int(id_): offset
            for id_, offset in iteritems(table.attrs['calendar_offset'])
        }
        # Cache of fully read np.array for the carrays in the daily bar table.
        # raw_array does not use the same cache, but it could.
        # Need to test keeping the entire array in memory for the course of a
        # process first.
        self._spot_cols = {}

    def _compute_slices(self, start_idx, end_idx, assets):
        """
        Compute the raw row indices to load for each asset on a query for the
        given dates after applying a shift.

        Parameters
        ----------
        start_idx : int
            Index of first date for which we want data.
        end_idx : int
            Index of last date for which we want data.
        assets : pandas.Int64Index
            Assets for which we want to compute row indices.

        Returns
        -------
        A 3-tuple of (first_rows, last_rows, offsets):
        first_rows : np.array[intp]
            Array with length == len(assets) containing the index of the first
            row to load for each asset in `assets`.
        last_rows : np.array[intp]
            Array with length == len(assets) containing the index of the last
            row to load for each asset in `assets`.
        offsets : np.array[intp]
            Array with length == len(assets) containing the index in a buffer
            of length `dates` corresponding to the first row of each asset.

            The value of offset[i] will be 0 if asset[i] existed at the start
            of a query.  Otherwise, offset[i] will be equal to the number of
            entries in `dates` for which the asset did not yet exist.
        """
        # The core implementation of the logic here is implemented in Cython
        # for efficiency.
        return _compute_row_slices(
            self._first_rows,
            self._last_rows,
            self._calendar_offsets,
            start_idx,
            end_idx,
            assets,
        )

    def load_raw_arrays(self, columns, start_date, end_date, assets):
        # Assumes that the given dates are actually in calendar.
        start_idx = self._calendar.get_loc(start_date)
        end_idx = self._calendar.get_loc(end_date)
        first_rows, last_rows, offsets = self._compute_slices(
            start_idx,
            end_idx,
            assets,
        )
        return _read_bcolz_data(
            self._table,
            (end_idx - start_idx + 1, len(assets)),
            [column.name for column in columns],
            first_rows,
            last_rows,
            offsets,
        )

    def _spot_col(self, colname):
        """
        Get the colname from daily_bar_table and read all of it into memory,
        caching the result.

        Parameters
        ----------
        colname : string
            The name of an OHLCV carray in the daily_bar_table.

        Returns
        -------
        array (uint32)
            Fully-read array of the carray in the daily_bar_table with the
            given colname.
        """
        try:
            col = self._spot_cols[colname]
        except KeyError:
            col = self._spot_cols[colname] = self._table[colname][:]
        return col

    def spot_price(self, sid, day, colname):
        """
        Parameters
        ----------
        sid : int
            The asset identifier.
        day : datetime64
            Midnight of the day for which data is requested.
        colname : string
            The price field, e.g. one of ('open', 'high', 'low', 'close', 'volume').

        Returns
        -------
        float
            The spot price for colname of the given sid on the given day.
            Raises a NoDataOnDate exception if there is no data available
            for the given day and sid.
        """
        day_loc = self._calendar.get_loc(day)
        offset = day_loc - self._calendar_offsets[sid]
        if offset < 0:
            raise NoDataOnDate(
                "No data on or before day={0} for sid={1}".format(
                    day, sid))
        ix = self._first_rows[sid] + offset
        if ix > self._last_rows[sid]:
            raise NoDataOnDate(
                "No data on or after day={0} for sid={1}".format(
                    day, sid))
        return self._spot_col(colname)[ix] * 0.001
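As the docstring above notes, price columns store 1000 * the as-traded dollar value, which spot_price undoes by scaling with 0.001. A minimal standalone sketch of that fixed-point round trip (encode_prices/decode_prices are illustrative names, not zipline API):

import numpy as np

# Illustrative round trip for the fixed-point convention described above:
# prices are stored as uint32 values of 1000 * dollar price and read back
# by multiplying with 0.001, as spot_price does.
def encode_prices(dollars):
    return np.round(np.asarray(dollars) * 1000).astype(np.uint32)

def decode_prices(raw):
    return raw * 0.001

raw = encode_prices([10.001, 250.55])
assert np.allclose(decode_prices(raw), [10.001, 250.55])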
Example #4
0
class BcolzDailyBarReader(object):
    """
    Reader for raw pricing data written by BcolzDailyOHLCVWriter.

    A Bcolz CTable is composed of Columns and Attributes.

    Columns
    -------
    The table with which this loader interacts contains the following columns:

    ['open', 'high', 'low', 'close', 'volume', 'day', 'id'].

    The data in these columns is interpreted as follows:

    - Price columns ('open', 'high', 'low', 'close') are interpreted as 1000 *
      as-traded dollar value.
    - Volume is interpreted as as-traded volume.
    - Day is interpreted as seconds since midnight UTC, Jan 1, 1970.
    - Id is the asset id of the row.

    The data in each column is grouped by asset and then sorted by day within
    each asset block.

    The table is built to represent a long time range of data, e.g. ten years
    of equity data, so the asset blocks are generally of unequal length. The
    blocks are clipped to the known start and end date of each asset to cut
    down on the number of empty values that would need to be included to make
    a regular/cubic dataset.

    When read across columns, rows of open, high, low, close, and volume with
    the same index represent the same asset and day.

    Attributes
    ----------
    The table with which this loader interacts contains the following
    attributes:

    first_row : dict
        Map from asset_id -> index of first row in the dataset with that id.
    last_row : dict
        Map from asset_id -> index of last row in the dataset with that id.
    calendar_offset : dict
        Map from asset_id -> calendar index of first row.
    calendar : list[int64]
        Calendar used to compute offsets, in asi8 format (ns since EPOCH).

    We use first_row and last_row together to quickly find ranges of rows to
    load when reading an asset's data into memory.

    We use calendar_offset and calendar to orient loaded blocks within a
    range of queried dates.
    """
    def __init__(self, table):
        if isinstance(table, string_types):
            table = ctable(rootdir=table, mode='r')

        self._table = table
        self._calendar = DatetimeIndex(table.attrs['calendar'], tz='UTC')
        self._first_rows = {
            int(asset_id): start_index
            for asset_id, start_index in iteritems(table.attrs['first_row'])
        }
        self._last_rows = {
            int(asset_id): end_index
            for asset_id, end_index in iteritems(table.attrs['last_row'])
        }
        self._calendar_offsets = {
            int(id_): offset
            for id_, offset in iteritems(table.attrs['calendar_offset'])
        }

    def _slice_locs(self, start_date, end_date):
        try:
            start = self._calendar.get_loc(start_date)
        except KeyError:
            if start_date < self._calendar[0]:
                raise NoFurtherDataError(msg=(
                    "FFC Query requesting data starting on {query_start}, "
                    "but first known date is {calendar_start}").format(
                        query_start=str(start_date),
                        calendar_start=str(self._calendar[0]),
                    ))
            else:
                raise ValueError("Query start %s not in calendar" % start_date)
        try:
            stop = self._calendar.get_loc(end_date)
        except KeyError:
            if end_date > self._calendar[-1]:
                raise NoFurtherDataError(
                    msg=("FFC Query requesting data up to {query_end}, "
                         "but last known date is {calendar_end}").format(
                             query_end=end_date,
                             calendar_end=self._calendar[-1],
                         ))
            else:
                raise ValueError("Query end %s not in calendar" % end_date)
        return start, stop

    def _compute_slices(self, dates, assets):
        """
        Compute the raw row indices to load for each asset on a query for the
        given dates.

        Parameters
        ----------
        dates : pandas.DatetimeIndex
            Dates of the query on which we want to compute row indices.
        assets : pandas.Int64Index
            Assets for which we want to compute row indices.

        Returns
        -------
        A 3-tuple of (first_rows, last_rows, offsets):
        first_rows : np.array[intp]
            Array with length == len(assets) containing the index of the first
            row to load for each asset in `assets`.
        last_rows : np.array[intp]
            Array with length == len(assets) containing the index of the last
            row to load for each asset in `assets`.
        offsets : np.array[intp]
            Array with length == len(assets) containing the index in a buffer
            of length `dates` corresponding to the first row of each asset.

            The value of offset[i] will be 0 if asset[i] existed at the start
            of a query.  Otherwise, offset[i] will be equal to the number of
            entries in `dates` for which the asset did not yet exist.
        """
        start, stop = self._slice_locs(dates[0], dates[-1])

        # Sanity check that the requested date range matches our calendar.
        # This could be removed in the future if it's materially affecting
        # performance.
        query_dates = self._calendar[start:stop + 1]
        if not array_equal(query_dates.values, dates.values):
            raise ValueError("Incompatible calendars!")

        # The core implementation of the logic here is implemented in Cython
        # for efficiency.
        return _compute_row_slices(
            self._first_rows,
            self._last_rows,
            self._calendar_offsets,
            start,
            stop,
            assets,
        )

    def load_raw_arrays(self, columns, dates, assets):
        first_rows, last_rows, offsets = self._compute_slices(dates, assets)
        return _read_bcolz_data(
            self._table,
            (len(dates), len(assets)),
            [column.name for column in columns],
            first_rows,
            last_rows,
            offsets,
        )
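_slice_locs above distinguishes query dates outside the known calendar (NoFurtherDataError) from dates that simply are not trading days (ValueError). A rough standalone sketch of that decision logic against a plain pandas calendar (the names here are hypothetical):

import pandas as pd

calendar = pd.DatetimeIndex(["2014-01-02", "2014-01-03", "2014-01-06"], tz="UTC")

# Hedged sketch of the _slice_locs decision logic: dates outside the known
# range get a different error from dates that are merely non-trading days.
def locate(day):
    try:
        return calendar.get_loc(day)
    except KeyError:
        if day < calendar[0] or day > calendar[-1]:
            raise LookupError("no further data: %s outside known calendar" % day)
        raise ValueError("%s not in calendar" % day)

assert locate(pd.Timestamp("2014-01-03", tz="UTC")) == 1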
Example #5
0
 def test_get_loc_reasonable_key_error(self):
     # GH#1062
     index = DatetimeIndex(["1/3/2000"])
     with pytest.raises(KeyError, match="2000"):
         index.get_loc("1/1/2000")
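For reference, the match argument here is a regular expression that pytest tests with re.search against the string form of the raised exception, so '2000' only has to appear somewhere in the KeyError message. A small self-contained demonstration:

import pytest

# match is searched in str(excinfo.value), so a bare substring like "2000"
# passes as long as it appears anywhere in the exception message.
def test_match_is_regex_search():
    with pytest.raises(KeyError, match="2000"):
        raise KeyError("Timestamp('2000-01-01 00:00:00')")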
Example #6
0
class BcolzDailyBarReader(object):
    """
    Reader for raw pricing data written by BcolzDailyOHLCVWriter.

    A Bcolz CTable is composed of Columns and Attributes.

    Columns
    -------
    The table with which this loader interacts contains the following columns:

    ['open', 'high', 'low', 'close', 'volume', 'day', 'id'].

    The data in these columns is interpreted as follows:

    - Price columns ('open', 'high', 'low', 'close') are interpreted as 1000 *
      as-traded dollar value.
    - Volume is interpreted as as-traded volume.
    - Day is interpreted as seconds since midnight UTC, Jan 1, 1970.
    - Id is the asset id of the row.

    The data in each column is grouped by asset and then sorted by day within
    each asset block.

    The table is built to represent a long time range of data, e.g. ten years
    of equity data, so the asset blocks are generally of unequal length. The
    blocks are clipped to the known start and end date of each asset to cut
    down on the number of empty values that would need to be included to make
    a regular/cubic dataset.

    When read across columns, rows of open, high, low, close, and volume with
    the same index represent the same asset and day.

    Attributes
    ----------
    The table with which this loader interacts contains the following
    attributes:

    first_row : dict
        Map from asset_id -> index of first row in the dataset with that id.
    last_row : dict
        Map from asset_id -> index of last row in the dataset with that id.
    calendar_offset : dict
        Map from asset_id -> calendar index of first row.
    calendar : list[int64]
        Calendar used to compute offsets, in asi8 format (ns since EPOCH).

    We use first_row and last_row together to quickly find ranges of rows to
    load when reading an asset's data into memory.

    We use calendar_offset and calendar to orient loaded blocks within a
    range of queried dates.
    """
    def __init__(self, table):
        if isinstance(table, string_types):
            table = ctable(rootdir=table, mode='r')

        self._table = table
        self._calendar = DatetimeIndex(table.attrs['calendar'], tz='UTC')
        self._first_rows = {
            int(asset_id): start_index
            for asset_id, start_index in iteritems(table.attrs['first_row'])
        }
        self._last_rows = {
            int(asset_id): end_index
            for asset_id, end_index in iteritems(table.attrs['last_row'])
        }
        self._calendar_offsets = {
            int(id_): offset
            for id_, offset in iteritems(table.attrs['calendar_offset'])
        }
        # Cache of fully read np.array for the carrays in the daily bar table.
        # raw_array does not use the same cache, but it could.
        # Need to test keeping the entire array in memory for the course of a
        # process first.
        self._spot_cols = {}

    def _compute_slices(self, start_idx, end_idx, assets):
        """
        Compute the raw row indices to load for each asset on a query for the
        given dates after applying a shift.

        Parameters
        ----------
        start_idx : int
            Index of first date for which we want data.
        end_idx : int
            Index of last date for which we want data.
        assets : pandas.Int64Index
            Assets for which we want to compute row indices.

        Returns
        -------
        A 3-tuple of (first_rows, last_rows, offsets):
        first_rows : np.array[intp]
            Array with length == len(assets) containing the index of the first
            row to load for each asset in `assets`.
        last_rows : np.array[intp]
            Array with length == len(assets) containing the index of the last
            row to load for each asset in `assets`.
        offsets : np.array[intp]
            Array with length == len(assets) containing the index in a buffer
            of length `dates` corresponding to the first row of each asset.

            The value of offset[i] will be 0 if asset[i] existed at the start
            of a query.  Otherwise, offset[i] will be equal to the number of
            entries in `dates` for which the asset did not yet exist.
        """
        # The core implementation of the logic here is implemented in Cython
        # for efficiency.
        return _compute_row_slices(
            self._first_rows,
            self._last_rows,
            self._calendar_offsets,
            start_idx,
            end_idx,
            assets,
        )

    def load_raw_arrays(self, columns, start_date, end_date, assets):
        # Assumes that the given dates are actually in calendar.
        start_idx = self._calendar.get_loc(start_date)
        end_idx = self._calendar.get_loc(end_date)
        first_rows, last_rows, offsets = self._compute_slices(
            start_idx,
            end_idx,
            assets,
        )
        return _read_bcolz_data(
            self._table,
            (end_idx - start_idx + 1, len(assets)),
            [column.name for column in columns],
            first_rows,
            last_rows,
            offsets,
        )

    def _spot_col(self, colname):
        """
        Get the colname from daily_bar_table and read all of it into memory,
        caching the result.

        Parameters
        ----------
        colname : string
            The name of an OHLCV carray in the daily_bar_table.

        Returns
        -------
        array (uint32)
            Fully-read array of the carray in the daily_bar_table with the
            given colname.
        """
        try:
            col = self._spot_cols[colname]
        except KeyError:
            col = self._spot_cols[colname] = self._table[colname][:]
        return col

    def spot_price(self, sid, day, colname):
        """
        Parameters
        ----------
        sid : int
            The asset identifier.
        day : datetime64
            Midnight of the day for which data is requested.
        colname : string
            The price field, e.g. one of ('open', 'high', 'low', 'close', 'volume').

        Returns
        -------
        float
            The spot price for colname of the given sid on the given day.
            Raises a NoDataOnDate exception if there is no data available
            for the given day and sid.
        """
        day_loc = self._calendar.get_loc(day)
        offset = day_loc - self._calendar_offsets[sid]
        if offset < 0:
            raise NoDataOnDate(
                "No data on or before day={0} for sid={1}".format(day, sid))
        ix = self._first_rows[sid] + offset
        if ix > self._last_rows[sid]:
            raise NoDataOnDate(
                "No data on or after day={0} for sid={1}".format(day, sid))
        return self._spot_col(colname)[ix] * 0.001
Example #7
0
 def test_get_loc_nat(self):
     # GH#20464
     index = DatetimeIndex(["1/3/2000", "NaT"])
     assert index.get_loc(pd.NaT) == 1
Example #8
0
 def test_reasonable_key_error(self):
     # GH#1062
     index = DatetimeIndex(['1/3/2000'])
     with pytest.raises(KeyError, match='2000'):
         index.get_loc('1/1/2000')
Example #9
0
 def test_get_loc_nat(self):
     # GH#20464
     index = DatetimeIndex(['1/3/2000', 'NaT'])
     assert index.get_loc(pd.NaT) == 1
Example #10
0
class BcolzDailyBarReader(object):
    """
    Reader for raw pricing data written by BcolzDailyOHLCVWriter.

    A Bcolz CTable is composed of Columns and Attributes.

    Columns
    -------
    The table with which this loader interacts contains the following columns:

    ['open', 'high', 'low', 'close', 'volume', 'day', 'id'].

    The data in these columns is interpreted as follows:

    - Price columns ('open', 'high', 'low', 'close') are interpreted as 1000 *
      as-traded dollar value.
    - Volume is interpreted as as-traded volume.
    - Day is interpreted as seconds since midnight UTC, Jan 1, 1970.
    - Id is the asset id of the row.

    The data in each column is grouped by asset and then sorted by day within
    each asset block.

    The table is built to represent a long time range of data, e.g. ten years
    of equity data, so the asset blocks are generally of unequal length. The
    blocks are clipped to the known start and end date of each asset to cut
    down on the number of empty values that would need to be included to make
    a regular/cubic dataset.

    When read across columns, rows of open, high, low, close, and volume with
    the same index represent the same asset and day.

    Attributes
    ----------
    The table with which this loader interacts contains the following
    attributes:

    first_row : dict
        Map from asset_id -> index of first row in the dataset with that id.
    last_row : dict
        Map from asset_id -> index of last row in the dataset with that id.
    calendar_offset : dict
        Map from asset_id -> calendar index of first row.
    calendar : list[int64]
        Calendar used to compute offsets, in asi8 format (ns since EPOCH).

    We use first_row and last_row together to quickly find ranges of rows to
    load when reading an asset's data into memory.

    We use calendar_offset and calendar to orient loaded blocks within a
    range of queried dates.
    """
    def __init__(self, table):
        if isinstance(table, string_types):
            table = ctable(rootdir=table, mode='r')

        self._table = table
        self._calendar = DatetimeIndex(table.attrs['calendar'], tz='UTC')
        self._first_rows = {
            int(asset_id): start_index
            for asset_id, start_index in iteritems(table.attrs['first_row'])
        }
        self._last_rows = {
            int(asset_id): end_index
            for asset_id, end_index in iteritems(table.attrs['last_row'])
        }
        self._calendar_offsets = {
            int(id_): offset
            for id_, offset in iteritems(table.attrs['calendar_offset'])
        }

    def _compute_slices(self, start_idx, end_idx, assets):
        """
        Compute the raw row indices to load for each asset on a query for the
        given dates after applying a shift.

        Parameters
        ----------
        start_idx : int
            Index of first date for which we want data.
        end_idx : int
            Index of last date for which we want data.
        assets : pandas.Int64Index
            Assets for which we want to compute row indices.

        Returns
        -------
        A 3-tuple of (first_rows, last_rows, offsets):
        first_rows : np.array[intp]
            Array with length == len(assets) containing the index of the first
            row to load for each asset in `assets`.
        last_rows : np.array[intp]
            Array with length == len(assets) containing the index of the last
            row to load for each asset in `assets`.
        offsets : np.array[intp]
            Array with length == len(assets) containing the index in a buffer
            of length `dates` corresponding to the first row of each asset.

            The value of offset[i] will be 0 if asset[i] existed at the start
            of a query.  Otherwise, offset[i] will be equal to the number of
            entries in `dates` for which the asset did not yet exist.
        """
        # The core implementation of the logic here is implemented in Cython
        # for efficiency.
        return _compute_row_slices(
            self._first_rows,
            self._last_rows,
            self._calendar_offsets,
            start_idx,
            end_idx,
            assets,
        )

    def load_raw_arrays(self, columns, start_date, end_date, assets):
        # Assumes that the given dates are actually in calendar.
        start_idx = self._calendar.get_loc(start_date)
        end_idx = self._calendar.get_loc(end_date)
        first_rows, last_rows, offsets = self._compute_slices(
            start_idx,
            end_idx,
            assets,
        )
        return _read_bcolz_data(
            self._table,
            (end_idx - start_idx + 1, len(assets)),
            [column.name for column in columns],
            first_rows,
            last_rows,
            offsets,
        )
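The Cython helper _compute_row_slices is not shown in these snippets, but its contract follows from the docstring above. A pure-NumPy reading of that contract (a sketch under that reading, not zipline's actual implementation):

import numpy as np

# Sketch of the _compute_row_slices contract described in the docstring:
# clamp each asset's stored row range to the queried calendar window and
# record how far into the window the asset's data begins.
def compute_row_slices(first_rows, last_rows, calendar_offsets,
                       start_idx, end_idx, assets):
    n = len(assets)
    first = np.empty(n, dtype=np.intp)
    last = np.empty(n, dtype=np.intp)
    offsets = np.empty(n, dtype=np.intp)
    for i, sid in enumerate(assets):
        cal_off = calendar_offsets[sid]  # calendar index of the asset's first row
        first[i] = first_rows[sid] + max(0, start_idx - cal_off)
        last[i] = min(last_rows[sid], first_rows[sid] + (end_idx - cal_off))
        # 0 if the asset existed at start_idx, else the number of query
        # dates that precede the asset's first date.
        offsets[i] = max(0, cal_off - start_idx)
    return first, last, offsets

# Asset 1 stores rows 0..9 starting at calendar position 2; querying
# positions 0..4 loads rows 0..2 with the first two query dates empty.
first, last, offsets = compute_row_slices({1: 0}, {1: 9}, {1: 2}, 0, 4, [1])
assert (first[0], last[0], offsets[0]) == (0, 2, 2)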
Example #11
0
    def update(self,
               index: pd.DatetimeIndex,
               values: np.ndarray = None,
               conf_lo: np.ndarray = None,
               conf_hi: np.ndarray = None,
               inplace: bool = True) -> 'TimeSeries':
        """
        Updates the series with the new values provided. Indices that are not
        in the original TimeSeries are discarded. At least one parameter other
        than index must be filled. Use np.nan in an array to leave the value
        at the corresponding index unchanged.

        An error is raised on an attempt to update a confidence interval
        series that the TimeSeries does not have.

        :param index: A DatetimeIndex containing the indices to replace.
        :param values: An array containing the replacement values (optional).
        :param conf_lo: The lower confidence interval values to change (optional).
        :param conf_hi: The upper confidence interval values to change (optional).
        :param inplace: If True, do the operation in place and return None; defaults to True.
        :return: A TimeSeries with updated values.

        TODO: Do we need this method? Where/how is it used? We should avoid mutating values at all costs.
        """
        raise_if_not(not (values is None and conf_lo is None and conf_hi is None),
                     "At least one parameter other than index must be filled.",
                     logger)
        raise_if_not(index is not None, "Index must be filled.")
        if values is not None:
            raise_if_not(len(values) == len(index),
                         "The number of values must correspond to the number "
                         "of indices: {} != {}".format(len(values), len(index)),
                         logger)
        if conf_lo is not None:
            raise_if_not(len(conf_lo) == len(index),
                         "The number of values must correspond to the number "
                         "of indices: {} != {}".format(len(conf_lo), len(index)),
                         logger)
        if conf_hi is not None:
            raise_if_not(len(conf_hi) == len(index),
                         "The number of values must correspond to the number "
                         "of indices: {} != {}".format(len(conf_hi), len(index)),
                         logger)
        ignored_indices = [
            index.get_loc(ind) for ind in (set(index) - set(self.time_index()))
        ]
        index = index.delete(ignored_indices)
        series = values if values is None else pd.Series(
            np.delete(values, ignored_indices), index=index)
        conf_lo = conf_lo if conf_lo is None else pd.Series(
            np.delete(conf_lo, ignored_indices), index=index)
        conf_hi = conf_hi if conf_hi is None else pd.Series(
            np.delete(conf_hi, ignored_indices), index=index)
        raise_if_not(
            len(index) > 0, "Must give at least one correct index.", logger)
        if inplace:
            if series is not None:
                self._series.update(series)
            if conf_lo is not None:
                self._confidence_lo.update(conf_lo)
            if conf_hi is not None:
                self._confidence_hi.update(conf_hi)
            return None
        else:
            new_series = self.pd_series()
            new_lo = self.conf_lo_pd_series()
            new_hi = self.conf_hi_pd_series()
            if series is not None:
                new_series.update(series)
            if conf_lo is not None:
                new_lo.update(conf_lo)
            if conf_hi is not None:
                new_hi.update(conf_hi)
            return TimeSeries(new_series, new_lo, new_hi)
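A hypothetical call pattern for update, assuming the TimeSeries constructor used in the snippet above (a pandas Series plus optional confidence series); the out-of-range index is silently dropped, as the docstring describes:

import numpy as np
import pandas as pd

# Hypothetical usage; assumes the TimeSeries class defined above.
dates = pd.date_range("2020-01-01", periods=5, freq="D")
ts = TimeSeries(pd.Series(np.arange(5.0), index=dates))

# Replace the values on Jan 2 and Jan 3 in place; an index outside the
# series (e.g. "2020-02-01") would be discarded, not added.
ts.update(pd.DatetimeIndex(["2020-01-02", "2020-01-03"]),
          values=np.array([10.0, 20.0]))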
Example #12
0
class BcolzDailyBarReader(object):
    """
    Reader for raw pricing data written by BcolzDailyOHLCVWriter.

    A Bcolz CTable is composed of Columns and Attributes.

    Columns
    -------
    The table with which this loader interacts contains the following columns:

    ['open', 'high', 'low', 'close', 'volume', 'day', 'id'].

    The data in these columns is interpreted as follows:

    - Price columns ('open', 'high', 'low', 'close') are interpreted as 1000 *
      as-traded dollar value.
    - Volume is interpreted as as-traded volume.
    - Day is interpreted as seconds since midnight UTC, Jan 1, 1970.
    - Id is the asset id of the row.

    The data in each column is grouped by asset and then sorted by day within
    each asset block.

    The table is built to represent a long time range of data, e.g. ten years
    of equity data, so the asset blocks are generally of unequal length. The
    blocks are clipped to the known start and end date of each asset to cut
    down on the number of empty values that would need to be included to make
    a regular/cubic dataset.

    When read across columns, rows of open, high, low, close, and volume with
    the same index represent the same asset and day.

    Attributes
    ----------
    The table with which this loader interacts contains the following
    attributes:

    first_row : dict
        Map from asset_id -> index of first row in the dataset with that id.
    last_row : dict
        Map from asset_id -> index of last row in the dataset with that id.
    calendar_offset : dict
        Map from asset_id -> calendar index of first row.
    calendar : list[int64]
        Calendar used to compute offsets, in asi8 format (ns since EPOCH).

    We use first_row and last_row together to quickly find ranges of rows to
    load when reading an asset's data into memory.

    We use calendar_offset and calendar to orient loaded blocks within a
    range of queried dates.
    """
    def __init__(self, table):
        if isinstance(table, string_types):
            table = ctable(rootdir=table, mode='r')

        self._table = table
        self._calendar = DatetimeIndex(table.attrs['calendar'], tz='UTC')
        self._first_rows = {
            int(asset_id): start_index
            for asset_id, start_index in iteritems(table.attrs['first_row'])
        }
        self._last_rows = {
            int(asset_id): end_index
            for asset_id, end_index in iteritems(table.attrs['last_row'])
        }
        self._calendar_offsets = {
            int(id_): offset
            for id_, offset in iteritems(table.attrs['calendar_offset'])
        }

    def _slice_locs(self, start_date, end_date):
        try:
            start = self._calendar.get_loc(start_date)
        except KeyError:
            if start_date < self._calendar[0]:
                raise NoFurtherDataError(
                    msg=(
                        "FFC Query requesting data starting on {query_start}, "
                        "but first known date is {calendar_start}"
                    ).format(
                        query_start=str(start_date),
                        calendar_start=str(self._calendar[0]),
                    )
                )
            else:
                raise ValueError("Query start %s not in calendar" % start_date)
        try:
            stop = self._calendar.get_loc(end_date)
        except KeyError:
            if end_date > self._calendar[-1]:
                raise NoFurtherDataError(
                    msg=(
                        "FFC Query requesting data up to {query_end}, "
                        "but last known date is {calendar_end}"
                    ).format(
                        query_end=end_date,
                        calendar_end=self._calendar[-1],
                    )
                )
            else:
                raise ValueError("Query end %s not in calendar" % end_date)
        return start, stop

    def _compute_slices(self, dates, assets):
        """
        Compute the raw row indices to load for each asset on a query for the
        given dates.

        Parameters
        ----------
        dates : pandas.DatetimeIndex
            Dates of the query on which we want to compute row indices.
        assets : pandas.Int64Index
            Assets for which we want to compute row indices.

        Returns
        -------
        A 3-tuple of (first_rows, last_rows, offsets):
        first_rows : np.array[intp]
            Array with length == len(assets) containing the index of the first
            row to load for each asset in `assets`.
        last_rows : np.array[intp]
            Array with length == len(assets) containing the index of the last
            row to load for each asset in `assets`.
        offsets : np.array[intp]
            Array with length == len(assets) containing the index in a buffer
            of length `dates` corresponding to the first row of each asset.

            The value of offset[i] will be 0 if asset[i] existed at the start
            of a query.  Otherwise, offset[i] will be equal to the number of
            entries in `dates` for which the asset did not yet exist.
        """
        start, stop = self._slice_locs(dates[0], dates[-1])

        # Sanity check that the requested date range matches our calendar.
        # This could be removed in the future if it's materially affecting
        # performance.
        query_dates = self._calendar[start:stop + 1]
        if not array_equal(query_dates.values, dates.values):
            raise ValueError("Incompatible calendars!")

        # The core implementation of the logic here is implemented in Cython
        # for efficiency.
        return _compute_row_slices(
            self._first_rows,
            self._last_rows,
            self._calendar_offsets,
            start,
            stop,
            assets,
        )

    def load_raw_arrays(self, columns, dates, assets):
        first_rows, last_rows, offsets = self._compute_slices(dates, assets)
        return _read_bcolz_data(
            self._table,
            (len(dates), len(assets)),
            [column.name for column in columns],
            first_rows,
            last_rows,
            offsets,
        )
Example #13
0
 def test_get_loc_nat(self):
     # GH#20464
     index = DatetimeIndex(['1/3/2000', 'NaT'])
     assert index.get_loc(pd.NaT) == 1
Example #14
0
 def test_reasonable_keyerror(self):
     # GH#1062
     index = DatetimeIndex(['1/3/2000'])
     with pytest.raises(KeyError) as excinfo:
         index.get_loc('1/1/2000')
     assert '2000' in str(excinfo.value)
Example #15
0
 def test_reasonable_key_error(self):
     # GH#1062
     index = DatetimeIndex(['1/3/2000'])
     with pytest.raises(KeyError, match='2000'):
         index.get_loc('1/1/2000')
Example #16
0
 def test_reasonable_keyerror(self):
     # GH#1062
     index = DatetimeIndex(['1/3/2000'])
     with pytest.raises(KeyError) as excinfo:
         index.get_loc('1/1/2000')
     assert '2000' in str(excinfo.value)