Example #1
def merge_ownership_periods(mappings):
    """
    Given a dict of mappings where the values are lists of
    OwnershipPeriod objects, returns a dict with the same structure with
    new OwnershipPeriod objects adjusted so that the periods have no
    gaps.

    Orders the periods chronologically, and pushes forward the end date
    of each period to match the start date of the following period. The
    end date of the last period pushed forward to the max Timestamp.
    """
    return valmap(
        lambda v: tuple(
            OwnershipPeriod(
                a.start,
                b.start,
                a.sid,
                a.value,
            ) for a, b in sliding_window(
                2,
                concatv(
                    sorted(v),
                    # concat with a fake ownership object to make the last
                    # end date be max timestamp
                    [OwnershipPeriod(
                        safe_tz_localize(pd.Timestamp.max, 'utc'),
                        None,
                        None,
                        None,
                    )],
                ),
            )
        ),
        mappings,
    )
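
The transformation above is easier to see on a tiny input. The sketch below is not part of the original example; it assumes OwnershipPeriod is a namedtuple with (start, end, sid, value) fields and uses toolz directly to show how each period's end date is pushed forward to the next period's start, with a sentinel supplying the final end date.

from collections import namedtuple
from toolz import concatv, sliding_window

OwnershipPeriod = namedtuple('OwnershipPeriod', ['start', 'end', 'sid', 'value'])

periods = [
    OwnershipPeriod(start=1, end=2, sid=0, value='a'),
    OwnershipPeriod(start=5, end=6, sid=0, value='b'),
]
# Sentinel whose start supplies the final end date (pd.Timestamp.max above).
sentinel = OwnershipPeriod(start=10, end=None, sid=None, value=None)

filled = [
    OwnershipPeriod(a.start, b.start, a.sid, a.value)
    for a, b in sliding_window(2, concatv(sorted(periods), [sentinel]))
]
# filled == [OwnershipPeriod(1, 5, 0, 'a'), OwnershipPeriod(5, 10, 0, 'b')]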
Example #2
def load_from_yahoo(indexes=None,
                    stocks=None,
                    start=None,
                    end=None,
                    adjusted=True):
    """
    Loads price data from Yahoo into a dataframe for each of the indicated
    assets.  By default, 'price' is taken from Yahoo's 'Adjusted Close',
    which removes the impact of splits and dividends. If the argument
    'adjusted' is False, then the non-adjusted 'close' field is used instead.

    :param indexes: Financial indexes to load.
    :type indexes: dict
    :param stocks: Stock closing prices to load.
    :type stocks: list
    :param start: Retrieve prices from start date on.
    :type start: datetime
    :param end: Retrieve prices until end date.
    :type end: datetime
    :param adjusted: Adjust the price for splits and dividends.
    :type adjusted: bool

    """
    data = _load_raw_yahoo_data(indexes, stocks, start, end)
    if adjusted:
        close_key = 'Adj Close'
    else:
        close_key = 'Close'
    df = pd.DataFrame({key: d[close_key] for key, d in iteritems(data)})
    df.index = safe_tz_localize(df.index, pytz.utc)
    return df
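
A hypothetical call, shown only to illustrate the documented parameters; it assumes the Yahoo endpoint behind _load_raw_yahoo_data is still reachable.

import datetime

prices = load_from_yahoo(
    stocks=['AAPL', 'MSFT'],
    start=datetime.datetime(2015, 1, 2),
    end=datetime.datetime(2015, 12, 31),
    adjusted=True,
)
# `prices` is a DataFrame with one column per symbol, indexed by UTC dates.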
Example #3
    def test_nearest_unequal_elements(self, tz):

        dts = safe_tz_localize(
            pd.to_datetime(
                ['2014-01-01', '2014-01-05', '2014-01-06', '2014-01-09']), tz)

        def t(s):
            return None if s is None else pd.Timestamp(s, tz=tz)

        for dt, before, after in (
                ('2013-12-30', None, '2014-01-01'),
                ('2013-12-31', None, '2014-01-01'),
                ('2014-01-01', None, '2014-01-05'),
                ('2014-01-02', '2014-01-01', '2014-01-05'),
                ('2014-01-03', '2014-01-01', '2014-01-05'),
                ('2014-01-04', '2014-01-01', '2014-01-05'),
                ('2014-01-05', '2014-01-01', '2014-01-06'),
                ('2014-01-06', '2014-01-05', '2014-01-09'),
                ('2014-01-07', '2014-01-06', '2014-01-09'),
                ('2014-01-08', '2014-01-06', '2014-01-09'),
                ('2014-01-09', '2014-01-06', None),
                ('2014-01-10', '2014-01-09', None),
                ('2014-01-11', '2014-01-09', None)):
            computed = nearest_unequal_elements(dts, t(dt))
            expected = (t(before), t(after))
            self.assertEqual(computed, expected)
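
For reference, the behavior this test pins down can be sketched with searchsorted on the sorted index: return the nearest element strictly before and strictly after the probe, or None at either boundary. This is a hedged sketch of the expected behavior, not the library's implementation.

def nearest_unequal_elements_sketch(dts, dt):
    # 'left' finds the first element >= dt; 'right' finds the first element > dt.
    lo = dts.searchsorted(dt, side='left')
    hi = dts.searchsorted(dt, side='right')
    before = dts[lo - 1] if lo > 0 else None
    after = dts[hi] if hi < len(dts) else None
    return before, after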
Example #4
    def init_class_fixtures(cls):
        super(PipelineAlgorithmTestCase, cls).init_class_fixtures()
        cls.pipeline_loader = USEquityPricingLoader(
            cls.bcolz_equity_daily_bar_reader,
            cls.adjustment_reader,
            USEquityPricing,
        )
        cls.dates = safe_tz_localize(cls.raw_data[cls.AAPL].index, 'UTC')
        cls.AAPL_split_date = Timestamp("2014-06-09", tz='UTC')
        cls.assets = cls.asset_finder.retrieve_all(
            cls.ASSET_FINDER_EQUITY_SIDS)
Example #5
    def _maybe_update_symbol_frame(self,
                                   start_time,
                                   api_key,
                                   cache,
                                   symbol,
                                   calendar,
                                   start_session,
                                   end_session,
                                   data_frequency,
                                   retries):

        # Attempt to load pre-existing symbol data from cache.
        key = '{sym}.{freq}.frame'.format(sym=symbol, freq=data_frequency)
        try:
            raw_data = cache[key]
        except KeyError:
            raw_data = None

        # Select the most recent date in the cached dataset if one exists,
        # otherwise use the provided `start_session`.
        last = start_session
        if raw_data is not None and len(raw_data) > 0:
            last = safe_tz_localize(raw_data.index[-1], 'UTC')

        should_sleep = False

        # Determine time at which cached data will be considered stale.
        cache_expiration = last + pd.Timedelta(days=2)
        if start_time <= cache_expiration and raw_data is not None:
            # Data is fresh enough to reuse, no need to update. Iterator can
            # proceed to next symbol directly since no API call was required.
            return raw_data, should_sleep

        # If we reach this point, an API call is about to be made.
        # Setting this flag tells the iterator to pause before starting
        # the next asset, so that we don't exceed the data source's rate
        # limit.
        should_sleep = True

        raw_data = self._fetch_symbol_frame(
            api_key,
            symbol,
            calendar,
            start_session,
            end_session,
            data_frequency,
            retries=retries,
        )

        # Cache latest symbol data.
        cache[key] = raw_data

        return raw_data, should_sleep
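
The freshness rule above reduces to simple timestamp arithmetic. A minimal illustration with hypothetical values, assuming tz-aware pandas Timestamps:

import pandas as pd

last = pd.Timestamp('2018-01-01', tz='UTC')        # most recent cached bar
start_time = pd.Timestamp('2018-01-02', tz='UTC')  # time of the current run

cache_expiration = last + pd.Timedelta(days=2)
reuse_cache = start_time <= cache_expiration
# reuse_cache is True: the cached frame is returned, no API call is made,
# and should_sleep stays False.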
Example #6
def load_frame(url, skiprows):
    """
    Load a DataFrame of data from a Bank of Canada site.
    """
    data = pd.read_csv(
        url,
        skiprows=skiprows,
        skipinitialspace=True,
        na_values=["Bank holiday", "Not available"],
        parse_dates=["Date"],
        index_col="Date",
    ).dropna(how='all')
    return safe_tz_localize(data, 'UTC').rename(columns=COLUMN_NAMES)
Example #7
def _load_cached_data(filename,
                      first_date,
                      last_date,
                      now,
                      resource_name,
                      environ=None):
    # Path for the cache.
    path = get_data_filepath(filename, environ)

    # If the path does not exist, it means the first download has not happened
    # yet, so don't try to read from 'path'.
    if os.path.exists(path):
        try:
            data = pd.DataFrame.from_csv(path)
            if data.empty:
                raise ValueError("File is empty.")
            data.index = safe_tz_localize(
                pd.to_datetime(data.index,
                               infer_datetime_format=True,
                               errors='coerce'), 'UTC')
            if has_data_for_dates(data, first_date, last_date):
                return data

            # Don't re-download if we've successfully downloaded and written a
            # file in the last hour.
            last_download_time = last_modified_time(path)
            if (now - last_download_time) <= ONE_HOUR:
                logger.warn(
                    "Refusing to download new {resource} data because a "
                    "download succeeded at {time}.",
                    resource=resource_name,
                    time=last_download_time,
                )
                return data

        except (OSError, IOError, ValueError) as e:
            # These can all be raised by various versions of pandas on various
            # classes of malformed input.  Treat them all as cache misses.
            logger.info(
                "Loading data for {path} failed with error [{error}].",
                path=path,
                error=e,
            )

    logger.info(
        "Cache at {path} does not have data from {start} to {end}.",
        start=first_date,
        end=last_date,
        path=path,
    )
    return None
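
`has_data_for_dates` is not shown here; a hedged guess at what it checks, consistent with how it is called above, is that the cached frame's index spans the requested window:

def has_data_for_dates_sketch(data, first_date, last_date):
    # Assumes `data` is indexed by tz-aware timestamps, as produced above.
    dts = data.index
    return len(dts) > 0 and dts[0] <= first_date and dts[-1] >= last_date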
Example #8
    def test_nearest_unequal_elements_short_dts(self, tz):

        # Length 1.
        dts = safe_tz_localize(pd.to_datetime(['2014-01-01']), tz)

        def t(s):
            return None if s is None else pd.Timestamp(s, tz=tz)

        for dt, before, after in (('2013-12-31', None, '2014-01-01'),
                                  ('2014-01-01', None, None),
                                  ('2014-01-02', '2014-01-01', None)):
            computed = nearest_unequal_elements(dts, t(dt))
            expected = (t(before), t(after))
            self.assertEqual(computed, expected)

        # Length 0
        dts = safe_tz_localize(pd.to_datetime([]), tz)
        for dt, before, after in (('2013-12-31', None, None),
                                  ('2014-01-01', None, None),
                                  ('2014-01-02', None, None)):
            computed = nearest_unequal_elements(dts, t(dt))
            expected = (t(before), t(after))
            self.assertEqual(computed, expected)
Example #9
    def _prelude(self, dt, field):
        session = self._trading_calendar.minute_to_session_label(dt)
        dt_value = dt.value
        cache = self._caches[field]
        if cache is None or cache[0] != session:
            market_open = self._market_opens.loc[session]
            cache = self._caches[field] = (session, market_open, {})

        _, market_open, entries = cache
        market_open = safe_tz_localize(market_open, 'UTC')
        if dt != market_open:
            prev_dt = dt_value - self._one_min
        else:
            prev_dt = None
        return market_open, prev_dt, dt_value, entries
Example #10
    def open_and_close_for_session(self, session_label):
        """
        Returns a tuple of timestamps of the open and close of the session
        represented by the given label.

        Parameters
        ----------
        session_label: pd.Timestamp
            The session whose open and close are desired.

        Returns
        -------
        (Timestamp, Timestamp)
            The open and close for the given session.
        """
        sched = self.schedule

        # `market_open` and `market_close` should be timezone aware, but pandas
        # 0.16.1 does not appear to support this:
        # http://pandas.pydata.org/pandas-docs/stable/whatsnew.html#datetime-with-tz  # noqa
        return (
            safe_tz_localize(sched.at[session_label, 'market_open'], 'UTC'),
            safe_tz_localize(sched.at[session_label, 'market_close'], 'UTC'),
        )
Example #11
def _dt_to_epoch_ns(dt_series):
    """Convert a timeseries into an Int64Index of nanoseconds since the epoch.

    Parameters
    ----------
    dt_series : pd.Series
        The timeseries to convert.

    Returns
    -------
    idx : pd.Int64Index
        The index converted to nanoseconds since the epoch.
    """
    index = pd.to_datetime(dt_series.values)
    index = safe_tz_localize(index, 'UTC')
    return index.view(np.int64)
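
A small illustration of the conversion with hypothetical input: the Unix epoch maps to 0 and one second later maps to 1,000,000,000 nanoseconds.

import pandas as pd

s = pd.Series(pd.to_datetime(['1970-01-01', '1970-01-01 00:00:01']))
_dt_to_epoch_ns(s)  # -> Int64Index([0, 1000000000]), per the docstring above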
Example #12
def load_bars_from_yahoo(indexes=None,
                         stocks=None,
                         start=None,
                         end=None,
                         adjusted=True):
    """
    Loads data from Yahoo into a panel with the following
    column names for each indicated security:

        - open
        - high
        - low
        - close
        - volume
        - price

    Note that 'price' is Yahoo's 'Adjusted Close', which removes the
    impact of splits and dividends. If the argument 'adjusted' is True, then
    the open, high, low, and close values are adjusted as well.

    :param indexes: Financial indexes to load.
    :type indexes: dict
    :param stocks: Stock closing prices to load.
    :type stocks: list
    :param start: Retrieve prices from start date on.
    :type start: datetime
    :param end: Retrieve prices until end date.
    :type end: datetime
    :param adjusted: Adjust open/high/low/close for splits and dividends.
        The 'price' field is always adjusted.
    :type adjusted: bool

    """
    data = _load_raw_yahoo_data(indexes, stocks, start, end)
    panel = pd.Panel(data)
    # Rename columns
    panel.minor_axis = ['open', 'high', 'low', 'close', 'volume', 'price']
    panel.major_axis = safe_tz_localize(panel.major_axis, pytz.utc)
    # Adjust data
    if adjusted:
        adj_cols = ['open', 'high', 'low', 'close']
        for ticker in panel.items:
            ratio = (panel[ticker]['price'] / panel[ticker]['close'])
            ratio_filtered = ratio.fillna(0).values
            for col in adj_cols:
                panel[ticker][col] *= ratio_filtered
    return panel
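
The adjustment loop scales the raw OHLC fields by the ratio of adjusted price to raw close. A one-line arithmetic illustration with hypothetical numbers:

ratio = 95.0 / 100.0            # adjusted 'price' / raw 'close'
adjusted_open = 101.0 * ratio   # raw open 101.0 becomes 95.95
adjusted_close = 100.0 * ratio  # raw close 100.0 becomes 95.0, matching 'price'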
Example #13
def days_at_time(days, t, tz, day_offset=0):
    """
    Create an index of days at time ``t``, interpreted in timezone ``tz``.

    The returned index is localized to UTC.

    Parameters
    ----------
    days : DatetimeIndex
        An index of dates (represented as midnight).
    t : datetime.time
        The time to apply as an offset to each day in ``days``.
    tz : pytz.timezone
        The timezone to use to interpret ``t``.
    day_offset : int
        The number of days we want to offset @days by

    Examples
    --------
    In the example below, the times switch from 13:45 to 12:45 UTC because
    March 13th is the daylight savings transition for US/Eastern.  All the
    times are still 8:45 when interpreted in US/Eastern.

    >>> import pandas as pd; import datetime; import pprint
    >>> dts = pd.date_range('2016-03-12', '2016-03-14')
    >>> dts_at_845 = days_at_time(dts, datetime.time(8, 45), 'US/Eastern')
    >>> pprint.pprint([str(dt) for dt in dts_at_845])
    ['2016-03-12 13:45:00+00:00',
     '2016-03-13 12:45:00+00:00',
     '2016-03-14 12:45:00+00:00']
    """
    if len(days) == 0:
        return days

    # Offset days without tz to avoid timezone issues.
    days = DatetimeIndex(days).tz_localize(None)
    delta = pd.Timedelta(
        days=day_offset,
        hours=t.hour,
        minutes=t.minute,
        seconds=t.second,
    )
    return safe_tz_localize((days + delta), tz).tz_convert('UTC')
Example #14
def get_treasury_data(start_date, end_date):
    data = pd.read_csv(
        "https://www.federalreserve.gov/datadownload/Output.aspx"
        "?rel=H15"
        "&series=bf17364827e38702b42a58cf8eaa3f78"
        "&lastObs="
        "&from="  # An unbounded query is ~2x faster than specifying dates.
        "&to="
        "&filetype=csv"
        "&label=include"
        "&layout=seriescolumn"
        "&type=package",
        skiprows=5,  # First 5 rows are useless headers.
        parse_dates=['Time Period'],
        na_values=['ND'],  # Presumably this stands for "No Data".
        index_col=0,
    ).loc[start_date:end_date].dropna(how='all').rename(
        columns=parse_treasury_csv_column)
    return safe_tz_localize(data,
                            'UTC') * 0.01  # Convert from 2.57% to 0.0257.
Example #15
def get_benchmark_returns(symbol, first_date, last_date):
    """
    Get a Series of benchmark returns from Google associated with `symbol`.
    Default is `SPY`.

    Parameters
    ----------
    symbol : str
        Benchmark symbol for which we're getting the returns.
    first_date : pd.Timestamp
        First date for which we want to get data.
    last_date : pd.Timestamp
        Last date for which we want to get data.

    The furthest date that Google goes back to is 1993-02-01. It has missing
    data for 2008-12-15, 2009-08-11, and 2012-02-02, so we add data for the
    dates for which Google is missing data.

    We're also limited to 4000 days worth of data per request. If we make a
    request for data that extends past 4000 trading days, we'll still only
    receive 4000 days of data.

    first_date is **not** included because we need the close from day N - 1 to
    compute the returns for day N.
    """
    if symbol == '^GSPC':
        symbol = 'spy'

    data = pd_reader.DataReader(symbol, 'google', first_date, last_date)

    data = data['Close']

    data[pd.Timestamp('2008-12-15')] = np.nan
    data[pd.Timestamp('2009-08-11')] = np.nan
    data[pd.Timestamp('2012-02-02')] = np.nan

    data = data.fillna(method='ffill')

    return safe_tz_localize(data.sort_index(), 'UTC').pct_change(1).iloc[1:]
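
A small illustration with hypothetical closes of why the first row is dropped: pct_change needs the close from day N - 1, so returns only exist from the second date onward.

import pandas as pd

closes = pd.Series(
    [100.0, 101.0, 99.0],
    index=pd.to_datetime(['2014-01-02', '2014-01-03', '2014-01-06']),
)
returns = closes.sort_index().pct_change(1).iloc[1:]
# 2014-01-03    0.010000   (101 / 100 - 1)
# 2014-01-06   -0.019802   ( 99 / 101 - 1)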
Example #16
    def _all_minutes_with_interval(self, interval):
        """
        Returns a DatetimeIndex representing all the minutes in this calendar.
        """
        opens_in_ns = \
            self._opens.values.astype('datetime64[ns]')

        closes_in_ns = \
            self._closes.values.astype('datetime64[ns]')

        deltas = closes_in_ns - opens_in_ns

        nanos_in_interval = interval * NANOS_IN_MINUTE

        # + 1 because we want 390 minutes per standard day, not 389
        daily_sizes = (deltas / nanos_in_interval) + 1
        num_minutes = np.sum(daily_sizes).astype(np.int64)

        # One allocation for the entire thing. This assumes that each day
        # represents a contiguous block of minutes.
        all_minutes = np.empty(num_minutes, dtype='datetime64[ns]')

        idx = 0
        for day_idx, size in enumerate(daily_sizes):
            # lots of small allocations, but it's fast enough for now.

            # size is a np.timedelta64, so we need to int it
            size_int = int(size)
            all_minutes[idx:(idx + size_int)] = \
                np.arange(
                    opens_in_ns[day_idx],
                    closes_in_ns[day_idx] + NANOS_IN_MINUTE,
                    nanos_in_interval
                )

            idx += size_int

        return safe_tz_localize(DatetimeIndex(all_minutes), "UTC")
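
To see why the "+ 1" is needed, assume the opens and closes label the first and last minute bars of a standard NYSE session (e.g. 9:31 and 16:00): the difference covers 389 one-minute steps, and counting both endpoints gives 390 bars.

NANOS_IN_MINUTE = 60 * 1000000000
delta_ns = 389 * NANOS_IN_MINUTE                # closes_in_ns - opens_in_ns for one day
bars_per_day = delta_ns // NANOS_IN_MINUTE + 1  # -> 390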
Example #17
class EventLoaderUtilsTestCase(ZiplineTestCase):
    # These cases test the following:
    # 1. Shuffling timestamps in DST/EST produces the correct normalized
    # timestamps
    # 2. Timestamps at query time boundaries are normalized correctly
    boundary_dates = [pd.Timestamp('2013-01-04 8:44:59'),
                      pd.Timestamp('2013-01-04 8:45:00'),
                      pd.Timestamp('2013-01-04 8:46:00')]
    us_boundary_dates = [safe_tz_localize(date, 'US/Eastern') for date in
                         boundary_dates]
    moscow_boundary_dates = [safe_tz_localize(date, 'Europe/Moscow') for date in
                             boundary_dates]
    mixed_tz_dates = [pd.Timestamp('2013-12-30'),
                      pd.Timestamp('2013-01-24'),
                      pd.Timestamp('2013-01-31 20:00:00'),
                      pd.Timestamp('2013-04-04'),
                      pd.Timestamp('2013-04-21'),
                      pd.Timestamp('2013-06-01')]
    us_dates = pd.to_datetime(us_boundary_dates + mixed_tz_dates,
                              utc=True).tz_localize(None)
    moscow_dates = pd.to_datetime(moscow_boundary_dates + mixed_tz_dates,
                                  utc=True).tz_localize(None)

    all_combos = list(map(np.array, itertools.permutations(np.arange(len(
        boundary_dates + mixed_tz_dates)
    ))))
    # len(permutations(7)) is about 5000, which makes this take too long.
    # Sampling down to 50-ish permutations still gives us good coverage of
    # the different interleavings.
    combos = all_combos[::100]

    expected_us = pd.Series(
        [pd.Timestamp('2013-01-04'),
         pd.Timestamp('2013-01-05'),
         pd.Timestamp('2013-01-05'),
         pd.Timestamp('2013-12-30'),
         pd.Timestamp('2013-01-24'),
         pd.Timestamp('2013-02-01'),
         pd.Timestamp('2013-04-04'),
         pd.Timestamp('2013-04-21'),
         pd.Timestamp('2013-06-01')]
    ).values

    # Russia's TZ offset is +4
    expected_russia = pd.Series(
        [pd.Timestamp('2013-01-04'),
         pd.Timestamp('2013-01-05'),
         pd.Timestamp('2013-01-05'),
         pd.Timestamp('2013-12-30'),
         pd.Timestamp('2013-01-24'),
         pd.Timestamp('2013-01-31'),
         pd.Timestamp('2013-04-04'),
         pd.Timestamp('2013-04-21'),
         pd.Timestamp('2013-06-01')]
    ).values

    # Test with timezones on either side of the meridian
    @parameterized.expand([(expected_us, 'US/Eastern', us_dates),
                           (expected_russia, 'Europe/Moscow', moscow_dates)])
    def test_normalize_to_query_time(self, expected, tz, dates):
        # Order matters in pandas 0.18.2. Prior to that, using tz_convert on
        # a DatetimeIndex with DST/EST timestamps mixed resulted in some of
        # them being an hour off (1 hour past midnight).
        for scrambler in self.combos:
            df = pd.DataFrame({"timestamp": dates[scrambler]})
            result = normalize_timestamp_to_query_time(df,
                                                       time(8, 45),
                                                       tz,
                                                       inplace=False,
                                                       ts_field='timestamp')

            timestamps = result['timestamp'].values
            check_arrays(np.sort(timestamps), np.sort(expected[scrambler]))
Example #18
    def session_close(self, session_label):
        return safe_tz_localize(
            self.schedule.at[session_label, 'market_close'], 'UTC')
Example #19
    def ingest_csv(self,
                   path,
                   data_frequency,
                   empty_rows_behavior='strip',
                   duplicates_threshold=100):
        """
        Ingest price data from a CSV file.

        Parameters
        ----------
        path: str
        data_frequency: str

        Returns
        -------
        list[str]
            A list of potential problems detected during ingestion.

        """
        log.info('ingesting csv file: {}'.format(path))

        if self.exchange is None:
            # Avoid circular dependencies
            from catalyst.exchange.utils.factory import get_exchange
            self.exchange = get_exchange(self.exchange_name)

        problems = []
        df = pd.read_csv(path,
                         header=0,
                         sep=',',
                         dtype=dict(symbol=np.object_,
                                    last_traded=np.object_,
                                    open=np.float64,
                                    high=np.float64,
                                    low=np.float64,
                                    close=np.float64,
                                    volume=np.float64),
                         parse_dates=['last_traded'],
                         index_col=None)
        min_start_dt = None
        max_end_dt = None

        symbols = df['symbol'].unique()

        # Apply the timezone before creating an index for simplicity
        df['last_traded'] = safe_tz_localize(df['last_traded'].dt, pytz.UTC)
        df.set_index(['symbol', 'last_traded'], drop=True, inplace=True)

        assets = dict()
        for symbol in symbols:
            start_dt = df.index.get_level_values(1).min()
            end_dt = df.index.get_level_values(1).max()
            end_dt_key = 'end_{}'.format(data_frequency)

            market = self.exchange.get_market(symbol)
            if market is None:
                raise ValueError('symbol not available in the exchange.')

            params = dict(
                exchange=self.exchange.name,
                data_source='local',
                exchange_symbol=market['id'],
            )
            mixin_market_params(self.exchange_name, params, market)

            asset_def = self.exchange.get_asset_def(market, True)
            if asset_def is not None:
                params['symbol'] = asset_def['symbol']

                params['start_date'] = asset_def['start_date'] \
                    if asset_def['start_date'] < start_dt else start_dt

                params['end_date'] = asset_def[end_dt_key] \
                    if asset_def[end_dt_key] > end_dt else end_dt

                params['end_daily'] = end_dt \
                    if data_frequency == 'daily' else asset_def['end_daily']

                params['end_minute'] = end_dt \
                    if data_frequency == 'minute' else asset_def['end_minute']

            else:
                params['symbol'] = get_catalyst_symbol(market)

                params['end_daily'] = end_dt \
                    if data_frequency == 'daily' else 'N/A'
                params['end_minute'] = end_dt \
                    if data_frequency == 'minute' else 'N/A'

            if min_start_dt is None or start_dt < min_start_dt:
                min_start_dt = start_dt

            if max_end_dt is None or end_dt > max_end_dt:
                max_end_dt = end_dt

            asset = TradingPair(**params)
            assets[market['id']] = asset

        save_exchange_symbols(self.exchange_name, assets, True)

        writer = self.get_writer(start_dt=min_start_dt.replace(hour=00,
                                                               minute=00),
                                 end_dt=max_end_dt.replace(hour=23, minute=59),
                                 data_frequency=data_frequency)

        for symbol in assets:
            # here the symbol is the market['id']
            asset = assets[symbol]
            ohlcv_df = df.loc[(df.index.get_level_values(0) == asset.symbol
                               )]  # type: pd.DataFrame
            ohlcv_df.index = ohlcv_df.index.droplevel(0)

            period_start = start_dt.replace(hour=00, minute=00)
            period_end = end_dt.replace(hour=23, minute=59)
            periods = self.get_calendar_periods_range(period_start, period_end,
                                                      data_frequency)

            # We're not really resampling but ensuring that each frame
            # contains data
            ohlcv_df = ohlcv_df.reindex(periods, method='ffill')
            ohlcv_df['volume'] = ohlcv_df['volume'].fillna(0)

            problems += self.ingest_df(
                ohlcv_df=ohlcv_df,
                data_frequency=data_frequency,
                asset=asset,
                writer=writer,
                empty_rows_behavior=empty_rows_behavior,
                duplicates_threshold=duplicates_threshold)
        return filter(partial(is_not, None), problems)
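
Based on the dtype and parse_dates arguments above, a CSV accepted by ingest_csv would presumably look like the following (hypothetical rows; column order is not significant):

sample_csv = """\
symbol,last_traded,open,high,low,close,volume
btc_usdt,2017-10-01 00:00:00,4341.05,4349.99,4287.00,4310.01,512.3
eth_usdt,2017-10-01 00:00:00,300.52,302.00,298.11,299.97,1043.7
"""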
Example #20
def get_trading_days(start, end, trading_day=trading_day):
    return safe_tz_localize(pd.date_range(start=start.date(),
                                          end=end.date(),
                                          freq=trading_day), 'UTC')
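
Every example above routes localization through safe_tz_localize, whose definition is not included here. A plausible minimal sketch consistent with these call sites (Timestamps, DatetimeIndexes, Series/DataFrames and .dt accessors; localize when naive, convert when already tz-aware) might look like the following; it is an assumption, not the project's actual helper.

import pandas as pd

def safe_tz_localize_sketch(obj, tz):
    if isinstance(obj, (pd.Series, pd.DataFrame)):
        # Series/DataFrame localize on their index, as obj.tz_localize does.
        aware = obj.index.tz is not None
    else:
        # Timestamps, DatetimeIndexes and Series.dt accessors all expose `tz`.
        aware = getattr(obj, 'tz', None) is not None
    return obj.tz_convert(tz) if aware else obj.tz_localize(tz)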