示例#1
0
    def get_history_window_series_and_load(self, assets, end_dt, bar_count,
                                           field, data_frequency):
        """
        Retrieve a price history window, ingesting the data if it is missing.

        The window is first read from the bundle. When the bundle raises
        PricingDataNotLoadedError, the missing range is ingested and the
        window is read a second time with a fresh reader.

        Parameters
        ----------
        assets: list
            Assets to retrieve; each must expose a `symbol` attribute.
        end_dt: datetime
            Date of the last bar of the window.
        bar_count: int
            Number of bars in the window.
        field: str
            Name of the field to read.
        data_frequency: str
            Frequency of the bundled data.

        Returns
        -------
        DataFrame
            The per-asset series returned by get_history_window_series,
            wrapped in a DataFrame on both the direct and the fallback path.

        """
        try:
            series = self.get_history_window_series(
                assets=assets,
                end_dt=end_dt,
                bar_count=bar_count,
                field=field,
                data_frequency=data_frequency)

        except PricingDataNotLoadedError:
            start_dt = get_start_dt(end_dt, bar_count, data_frequency)
            log.info('pricing data for {symbol} not found in range '
                     '{start} to {end}, updating the bundles.'.format(
                         symbol=[asset.symbol for asset in assets],
                         start=start_dt,
                         end=end_dt))
            self.ingest_assets(assets=assets,
                               start_dt=start_dt,
                               end_dt=end_dt,
                               data_frequency=data_frequency,
                               show_progress=True)
            # The reader is reset so that the freshly ingested data is
            # visible to this second read.
            series = self.get_history_window_series(
                assets=assets,
                end_dt=end_dt,
                bar_count=bar_count,
                field=field,
                data_frequency=data_frequency,
                reset_reader=True)

        # Previously the fallback path returned the raw series while the
        # direct path returned a DataFrame; callers now always get the same
        # type.
        return pd.DataFrame(series)
示例#2
0
    def test_ingest_candles(self):
        """
        Fetch minute candles from the exchange, ingest them into the bundle
        and print the resulting 'close' history window.
        """
        data_frequency = 'minute'
        ohlcv_fields = ['open', 'high', 'low', 'close', 'volume']

        exchange = get_exchange('bitfinex')
        bundle = ExchangeBundle(exchange)
        assets = [exchange.get_asset('iot_btc')]

        bar_count = 100
        end_dt = pd.to_datetime('2017-10-20', utc=True)
        start_dt = get_start_dt(end_dt, bar_count, data_frequency)

        candles = exchange.get_candles(
            assets=assets,
            start_dt=start_dt,
            end_dt=end_dt,
            bar_count=bar_count,
            freq='1T'
        )
        writer = bundle.get_writer(start_dt, end_dt, data_frequency)

        for asset in assets:
            asset_candles = candles[asset]
            timestamps = [bar['last_traded'] for bar in asset_candles]
            columns = dict(
                (name, [bar[name] for bar in asset_candles])
                for name in ohlcv_fields
            )

            periods = bundle.get_calendar_periods_range(
                start_dt, end_dt, data_frequency
            )
            # Align the candles on the calendar periods and forward-fill
            # the periods for which no candle was returned.
            ohlcv = pd.DataFrame(columns, index=timestamps)
            ohlcv = ohlcv.loc[periods].fillna(method='ffill')

            # TODO: why do I get an extra bar?
            bundle.ingest_df(
                ohlcv_df=ohlcv,
                data_frequency=data_frequency,
                asset=asset,
                writer=writer,
                empty_rows_behavior='raise',
                duplicates_behavior='raise'
            )

        window = bundle.get_history_window_series(
            assets=assets,
            end_dt=end_dt,
            bar_count=bar_count,
            field='close',
            data_frequency=data_frequency,
            reset_reader=True
        )
        print('\n' + df_to_string(pd.DataFrame(window)))
示例#3
0
    def test_validate_data(self):
        """
        Print the bundled 'close' prices side by side with the prices
        rebuilt from candles fetched directly from the exchange.
        """
        data_frequency = 'minute'

        exchange = get_exchange('bitfinex')
        exchange_bundle = ExchangeBundle(exchange)
        assets = [exchange.get_asset('iot_btc')]

        bar_count = 60
        end_dt = pd.to_datetime('2017-9-2 1:00', utc=True)

        # The bundle window is five times longer than the candle window.
        bundle_series = exchange_bundle.get_history_window_series(
            assets=assets,
            end_dt=end_dt,
            bar_count=bar_count * 5,
            field='close',
            data_frequency='minute',
        )
        candles = exchange.get_candles(
            assets=assets,
            end_dt=end_dt,
            bar_count=bar_count,
            data_frequency='minute'
        )
        start_dt = get_start_dt(end_dt, bar_count, data_frequency)

        frames = []
        for asset in assets:
            bundled = bundle_series[asset]
            bundle_df = pd.DataFrame(
                data=dict(bundle_price=bundled),
                index=bundled.index
            )

            from_exchange = exchange.get_series_from_candles(
                candles=candles[asset],
                start_dt=start_dt,
                end_dt=end_dt,
                data_frequency=data_frequency,
                field='close'
            )
            exchange_df = pd.DataFrame(
                data=dict(exchange_price=from_exchange),
                index=from_exchange.index
            )

            # Keep every exchange bar; bundle prices are attached where the
            # timestamps match.
            comparison = exchange_df.join(bundle_df, how='left')
            comparison['last_traded'] = comparison.index
            comparison['asset'] = asset.symbol
            comparison.set_index(['asset', 'last_traded'], inplace=True)
            frames.append(comparison)

        print('\n' + df_to_string(pd.concat(frames)))
示例#4
0
    def get_history_window_series_and_load(self,
                                           assets,
                                           end_dt,
                                           bar_count,
                                           field,
                                           data_frequency,
                                           algo_end_dt=None):
        """
        Retrieve price data history, ingest missing data.

        The window is first read from the bundle. When the bundle raises
        PricingDataNotLoadedError, data is ingested from the start of the
        window up to `algo_end_dt` and the window is read again.

        Parameters
        ----------
        assets: list[TradingPair]
        end_dt: datetime
            Date of the last bar of the window.
        bar_count: int
            Number of bars in the window.
        field: str
            Name of the field to read.
        data_frequency: str
            Frequency of the bundled data.
        algo_end_dt: datetime
            End of the range to ingest when data is missing. It is passed
            as-is to ingest_assets, including the default None.

        Returns
        -------
        DataFrame
            The per-asset series returned by get_history_window_series,
            wrapped in a DataFrame on both the direct and the fallback path.

        """
        try:
            series = self.get_history_window_series(
                assets=assets,
                end_dt=end_dt,
                bar_count=bar_count,
                field=field,
                data_frequency=data_frequency)

        except PricingDataNotLoadedError:
            start_dt = get_start_dt(end_dt, bar_count, data_frequency)
            log.info('pricing data for {symbol} not found in range '
                     '{start} to {end}, updating the bundles.'.format(
                         symbol=[asset.symbol for asset in assets],
                         start=start_dt,
                         end=end_dt))
            self.ingest_assets(assets=assets,
                               start_dt=start_dt,
                               end_dt=algo_end_dt,
                               data_frequency=data_frequency,
                               show_progress=True,
                               asset_chunks=True)
            # NOTE(review): the reader is not reset here, unlike the other
            # variants of this method -- confirm the ingested data is
            # visible to this second read.
            series = self.get_history_window_series(
                assets=assets,
                end_dt=end_dt,
                bar_count=bar_count,
                field=field,
                data_frequency=data_frequency,
                reset_reader=False)

        # Previously the fallback path returned the raw series while the
        # direct path returned a DataFrame; callers now always get the same
        # type.
        return pd.DataFrame(series)
示例#5
0
    def get_history_window_series(self,
                                  assets,
                                  end_dt,
                                  bar_count,
                                  field,
                                  data_frequency,
                                  trailing_bar_count=None,
                                  reset_reader=False):
        """
        Read a window of bundled data, one series per asset.

        Parameters
        ----------
        assets: list
            Assets to read; each must expose `symbol`, `sid`, `start_date`
            and `exchange` attributes.
        end_dt: datetime
            Date of the last bar of the window.
        bar_count: int
            Number of bars in the window.
        field: str
            Name of the field to read.
        data_frequency: str
            Frequency of the bundled data.
        trailing_bar_count: int
            When set, the end of the window is pushed forward by this many
            bars.
        reset_reader: bool
            Drop the cached reader and request a new one before reading.

        Returns
        -------
        dict
            Maps each asset to a Series of `field` values indexed by the
            calendar periods of the window.

        Raises
        ------
        PricingDataNotLoadedError
            No reader is available, or the window of an asset is not
            contained in the bundle.
        DataCorruptionError
            The reader returned no array for an asset.
        PricingDataValueError
            The values read do not fit the calendar periods of the window.

        """
        start_dt = get_start_dt(end_dt, bar_count, data_frequency, False)
        start_dt, _ = self.get_adj_dates(start_dt, end_dt, assets,
                                         data_frequency)

        if trailing_bar_count:
            delta = get_delta(trailing_bar_count, data_frequency)
            end_dt += delta

        # This is an attempt to resolve some caching with the reader
        # when auto-ingesting data.
        # TODO: needs more work
        reader = self.get_reader(data_frequency)
        # A missing reader has no cache entry to drop. Without this guard,
        # reader._rootdir raised AttributeError before the explicit
        # PricingDataNotLoadedError below could be reached.
        if reset_reader and reader is not None:
            del self._readers[reader._rootdir]
            reader = self.get_reader(data_frequency)

        if reader is None:
            symbols = [asset.symbol for asset in assets]
            raise PricingDataNotLoadedError(
                field=field,
                first_trading_day=min([asset.start_date for asset in assets]),
                exchange=self.exchange_name,
                symbols=symbols,
                symbol_list=','.join(symbols),
                data_frequency=data_frequency,
                start_dt=start_dt,
                end_dt=end_dt)

        series = dict()
        for asset in assets:
            # NOTE(review): the whole asset list is passed here, not only
            # the current asset -- confirm this is intended.
            asset_start_dt, _ = self.get_adj_dates(start_dt, end_dt, assets,
                                                   data_frequency)
            in_bundle = range_in_bundle(asset, asset_start_dt, end_dt, reader)
            if not in_bundle:
                raise PricingDataNotLoadedError(
                    field=field,
                    first_trading_day=asset.start_date,
                    exchange=self.exchange_name,
                    symbols=asset.symbol,
                    symbol_list=asset.symbol,
                    data_frequency=data_frequency,
                    start_dt=asset_start_dt,
                    end_dt=end_dt)

            periods = self.get_calendar_periods_range(asset_start_dt, end_dt,
                                                      data_frequency)
            # This does not behave well when requesting multiple assets
            # when the start or end date of one asset is outside of the range
            # looking at the logic in load_raw_arrays(), we are not achieving
            # any performance gain by requesting multiple sids at once. It's
            # looping through the sids and making separate requests anyway.
            arrays = reader.load_raw_arrays(sids=[asset.sid],
                                            fields=[field],
                                            start_dt=start_dt,
                                            end_dt=end_dt)
            if len(arrays) == 0:
                raise DataCorruptionError(exchange=self.exchange_name,
                                          symbols=asset.symbol,
                                          start_dt=asset_start_dt,
                                          end_dt=end_dt)

            # One field and one sid were requested: take the first array
            # and its first column.
            field_values = arrays[0][:, 0]

            try:
                value_series = pd.Series(field_values, index=periods)
                series[asset] = value_series
            except ValueError as e:
                raise PricingDataValueError(exchange=asset.exchange,
                                            symbol=asset.symbol,
                                            start_dt=asset_start_dt,
                                            end_dt=end_dt,
                                            error=e)

        return series
示例#6
0
    def get_history_window_series_and_load(self,
                                           assets,
                                           end_dt,
                                           bar_count,
                                           field,
                                           data_frequency,
                                           algo_end_dt=None,
                                           trailing_bar_count=None,
                                           force_auto_ingest=False):
        """
        Retrieve price data history, ingest missing data.

        Auto-ingestion only happens when the module-level AUTO_INGEST flag
        or `force_auto_ingest` is set. Otherwise a
        PricingDataNotLoadedError raised by the bundle is left to propagate
        to the caller.

        Parameters
        ----------
        assets: list[TradingPair]
        end_dt: pd.Timestamp
            Date of the last bar of the window.
        bar_count: int
            Number of bars in the window.
        field: str
            Name of the field to read.
        data_frequency: str
            Frequency of the bundled data.
        algo_end_dt: pd.Timestamp
            End of the range to ingest when data is missing. It is passed
            as-is to ingest_assets, including the default None.
        trailing_bar_count: int
            Forwarded unchanged to get_history_window_series.
        force_auto_ingest: bool
            Ingest missing data even when AUTO_INGEST is off.

        Returns
        -------
        DataFrame
            The per-asset series returned by get_history_window_series,
            wrapped in a DataFrame on every path.

        """
        if not (AUTO_INGEST or force_auto_ingest):
            series = self.get_history_window_series(
                assets=assets,
                end_dt=end_dt,
                bar_count=bar_count,
                field=field,
                data_frequency=data_frequency,
                trailing_bar_count=trailing_bar_count,
            )
            return pd.DataFrame(series)

        try:
            series = self.get_history_window_series(
                assets=assets,
                end_dt=end_dt,
                bar_count=bar_count,
                field=field,
                data_frequency=data_frequency,
                trailing_bar_count=trailing_bar_count,
            )

        except PricingDataNotLoadedError:
            start_dt = get_start_dt(end_dt, bar_count, data_frequency)
            log.info('pricing data for {symbol} not found in range '
                     '{start} to {end}, updating the bundles.'.format(
                         symbol=[asset.symbol for asset in assets],
                         start=start_dt,
                         end=end_dt))
            self.ingest_assets(
                assets=assets,
                start_dt=start_dt,
                end_dt=algo_end_dt,  # TODO: apply trailing bars
                data_frequency=data_frequency,
                show_progress=True,
                show_breakdown=True)
            # The reader is reset so that the freshly ingested data is
            # visible to this second read.
            series = self.get_history_window_series(
                assets=assets,
                end_dt=end_dt,
                bar_count=bar_count,
                field=field,
                data_frequency=data_frequency,
                reset_reader=True,
                trailing_bar_count=trailing_bar_count,
            )

        # Previously the fallback path returned the raw series while the
        # other paths returned a DataFrame; callers now always get the same
        # type.
        return pd.DataFrame(series)
示例#7
0
    def get_history_window(self,
                           assets,
                           end_dt,
                           bar_count,
                           frequency,
                           field,
                           data_frequency=None,
                           ffill=True):
        """
        Public API method that returns a dataframe containing the requested
        history window.  Data is fully adjusted.

        The window is read from the bundle first. Bars which are too recent
        to be in the bundle are requested from the exchange and appended to
        the bundled series.

        Parameters
        ----------
        assets : list of catalyst.data.Asset objects
            The assets whose data is desired.

        end_dt: datetime
            The date of the last bar.

        bar_count: int
            The number of bars desired.

        frequency: string
            A candle size followed by a unit, e.g. "1d" or "1m".

        field: string
            The desired field of the asset.

        data_frequency: string
            The frequency of the data to query; i.e. whether the data is
            'daily' or 'minute' bars. It is switched to match the unit of
            `frequency` when the two disagree.

        # TODO: fill how?
        ffill: boolean
            Currently not used by this method.

        Returns
        -------
        A dataframe containing the requested data.

        Raises
        ------
        InvalidHistoryFrequencyError
            `frequency` does not match the expected pattern.
        ValueError
            Candles must be aggregated and `field` has no aggregation rule.
        """
        # NOTE(review): the size group accepts any characters after the
        # first digit, so a malformed frequency can make int() fail with a
        # ValueError rather than InvalidHistoryFrequencyError.
        freq_match = re.match(r'([0-9].*)(m|M|d|D)', frequency, re.M | re.I)
        if not freq_match:
            raise InvalidHistoryFrequencyError(frequency)

        candle_size = int(freq_match.group(1))
        unit = freq_match.group(2).lower()

        if unit == 'd':
            if data_frequency == 'minute':
                data_frequency = 'daily'

        elif unit == 'm':
            if data_frequency == 'daily':
                data_frequency = 'minute'

        else:
            raise InvalidHistoryFrequencyError(frequency)

        adj_bar_count = candle_size * bar_count
        try:
            loaded = self.bundle.get_history_window_series_and_load(
                assets=assets,
                end_dt=end_dt,
                bar_count=adj_bar_count,
                field=field,
                data_frequency=data_frequency)
        except PricingDataNotLoadedError:
            loaded = dict()

        # Work on a plain dict of Series whether the bundle returned a dict
        # or a DataFrame: assigning a longer series to a DataFrame column
        # would silently drop the bars outside the existing index.
        series = dict((key, loaded[key]) for key in loaded)

        for asset in assets:
            bundled = series.get(asset)
            if bundled is not None and not bundled.index[-1] < end_dt:
                continue

            # Adding bars too recent to be contained in the consolidated
            # exchanges bundles. We go directly against the exchange
            # to retrieve the candles.
            start_dt = get_start_dt(end_dt, adj_bar_count, data_frequency)
            if bundled is not None:
                trailing_dt = bundled.index[-1] + get_delta(1, data_frequency)
                # The last bundled value; `iloc(0)` used to return the
                # indexer object itself instead of a value.
                last_value = bundled.iloc[-1]
            else:
                trailing_dt = start_dt
                last_value = np.nan

            trailing_bar_count = \
                get_periods(trailing_dt, end_dt, data_frequency)

            # The get_history method supports multiple asset
            candles = self.get_candles(data_frequency=data_frequency,
                                       assets=asset,
                                       bar_count=trailing_bar_count,
                                       end_dt=end_dt)

            candle_series = self.get_series_from_candles(
                candles=candles,
                start_dt=trailing_dt,
                end_dt=end_dt,
                field=field,
                previous_value=last_value)

            if bundled is not None:
                # The result of the merge used to be discarded, so the
                # trailing candles never reached the output.
                series[asset] = pd.concat([bundled, candle_series])
            else:
                series[asset] = candle_series

        df = pd.DataFrame(series)

        if candle_size > 1:
            aggregations = {
                'open': 'first',
                'high': 'max',
                'low': 'min',
                'close': 'last',
                'volume': 'sum',
            }
            if field not in aggregations:
                raise ValueError('Invalid field.')

            # Resample with the unit of the requested frequency; the rule
            # used to be expressed in minutes even for day candles.
            rule = '{}{}'.format(candle_size, 'D' if unit == 'd' else 'T')
            df = df.resample(rule).agg(aggregations[field])

        return df
示例#8
0
    def get_history_window_series(self,
                                  assets,
                                  end_dt,
                                  bar_count,
                                  field,
                                  data_frequency,
                                  reset_reader=False):
        """
        Read a window of bundled data for all assets in a single request.

        Parameters
        ----------
        assets: list
            Assets to read; each must expose `symbol`, `sid` and
            `start_date` attributes.
        end_dt: datetime
            Date of the last bar of the window.
        bar_count: int
            Number of bars in the window.
        field: str
            Name of the field to read.
        data_frequency: str
            Frequency of the bundled data.
        reset_reader: bool
            Drop the cached reader and request a new one before reading.

        Returns
        -------
        dict
            Maps each asset to a Series of `field` values indexed by the
            calendar periods of the window.

        Raises
        ------
        PricingDataNotLoadedError
            No reader is available, the window of an asset is not contained
            in the bundle, or the reader failed to load the arrays.

        """
        start_dt = get_start_dt(end_dt, bar_count, data_frequency, False)
        start_dt, end_dt = self.get_adj_dates(start_dt, end_dt, assets,
                                              data_frequency)

        reader = self.get_reader(data_frequency)
        # A missing reader has no cache entry to drop. Without this guard,
        # reader._rootdir raised AttributeError before the explicit
        # PricingDataNotLoadedError below could be reached.
        if reset_reader and reader is not None:
            del self._readers[reader._rootdir]
            reader = self.get_reader(data_frequency)

        if reader is None:
            symbols = [asset.symbol for asset in assets]
            raise PricingDataNotLoadedError(
                field=field,
                first_trading_day=min([asset.start_date for asset in assets]),
                exchange=self.exchange.name,
                symbols=symbols,
                symbol_list=','.join(symbols),
                data_frequency=data_frequency,
                start_dt=start_dt,
                end_dt=end_dt)

        for asset in assets:
            # NOTE(review): the whole asset list is passed here, not only
            # the current asset -- confirm this is intended.
            asset_start_dt, asset_end_dt = self.get_adj_dates(
                start_dt, end_dt, assets, data_frequency)

            in_bundle = range_in_bundle(asset, asset_start_dt, asset_end_dt,
                                        reader)
            if not in_bundle:
                raise PricingDataNotLoadedError(
                    field=field,
                    first_trading_day=asset.start_date,
                    exchange=self.exchange.name,
                    symbols=asset.symbol,
                    symbol_list=asset.symbol,
                    data_frequency=data_frequency,
                    start_dt=asset_start_dt,
                    end_dt=asset_end_dt)

        series = dict()
        try:
            arrays = reader.load_raw_arrays(
                sids=[asset.sid for asset in assets],
                fields=[field],
                start_dt=start_dt,
                end_dt=end_dt)

        except Exception:
            # Any reader failure is reported as missing pricing data.
            # The symbols are kept as text, as in the error raised above:
            # joining utf-8 encoded symbols with a str separator raises
            # TypeError on Python 3 and hid the intended error.
            symbols = [asset.symbol for asset in assets]
            raise PricingDataNotLoadedError(
                field=field,
                first_trading_day=min([asset.start_date for asset in assets]),
                exchange=self.exchange.name,
                symbols=symbols,
                symbol_list=','.join(symbols),
                data_frequency=data_frequency,
                start_dt=start_dt,
                end_dt=end_dt)

        periods = self.get_calendar_periods_range(start_dt, end_dt,
                                                  data_frequency)

        # The arrays are indexed by the position of the asset in `assets`.
        for asset_index, asset in enumerate(assets):
            asset_values = arrays[asset_index]

            value_series = pd.Series(asset_values.flatten(), index=periods)
            series[asset] = value_series

        return series
示例#9
0
    def get_history_window_with_bundle(self,
                                       assets,
                                       end_dt,
                                       bar_count,
                                       frequency,
                                       field,
                                       data_frequency=None,
                                       ffill=True,
                                       force_auto_ingest=False):
        """
        Public API method that returns a dataframe containing the requested
        history window.  Data is fully adjusted.

        The window is read from the bundle first. Bars which are too recent
        to be in the bundle are requested from the exchange and appended to
        the bundled series.

        Parameters
        ----------
        assets : list[TradingPair]
            The assets whose data is desired.

        end_dt: datetime
            The date of the last bar.

        bar_count: int
            The number of bars desired.

        frequency: string
            "1d" or "1m"

        field: string
            The desired field of the asset.

        data_frequency: string
            The frequency of the data to query; i.e. whether the data is
            'daily' or 'minute' bars.

        # TODO: fill how?
        ffill: boolean
            Currently not used by this method.

        force_auto_ingest: boolean
            Forwarded to the bundle's get_history_window_series_and_load.

        Returns
        -------
        DataFrame
            A dataframe containing the requested data, without the rows
            holding missing values.

        """
        freq, candle_size, unit, data_frequency = get_frequency(
            frequency, data_frequency)
        adj_bar_count = candle_size * bar_count

        try:
            loaded = self.bundle.get_history_window_series_and_load(
                assets=assets,
                end_dt=end_dt,
                bar_count=adj_bar_count,
                field=field,
                data_frequency=data_frequency,
                force_auto_ingest=force_auto_ingest)

        except (PricingDataNotLoadedError, NoDataAvailableOnExchange):
            loaded = dict()

        # Work on a plain dict of Series whether the bundle returned a dict
        # or a DataFrame: assigning a longer series to a DataFrame column
        # would silently drop the bars outside the existing index.
        series = dict((key, loaded[key]) for key in loaded)

        for asset in assets:
            bundled = series.get(asset)
            if bundled is not None and not bundled.index[-1] < end_dt:
                continue

            # Adding bars too recent to be contained in the consolidated
            # exchanges bundles. We go directly against the exchange
            # to retrieve the candles.
            start_dt = get_start_dt(end_dt, adj_bar_count, data_frequency)
            if bundled is not None:
                trailing_dt = bundled.index[-1] + get_delta(1, data_frequency)
                # The last bundled value; `iloc(0)` used to return the
                # indexer object itself instead of a value.
                last_value = bundled.iloc[-1]
            else:
                trailing_dt = start_dt
                last_value = np.nan

            # The get_history method supports multiple asset
            # Use the original frequency to let each api optimize
            # the size of result sets
            trailing_bar_count = get_periods(trailing_dt, end_dt, freq)
            candles = self.get_candles(freq=freq,
                                       assets=asset,
                                       bar_count=trailing_bar_count,
                                       start_dt=start_dt,
                                       end_dt=end_dt)

            # Create a series with the common data_frequency, ffill
            # missing values
            candle_series = self.get_series_from_candles(
                candles=candles,
                start_dt=trailing_dt,
                end_dt=end_dt,
                data_frequency=data_frequency,
                field=field,
                previous_value=last_value)

            if bundled is not None:
                # The result of the merge used to be discarded, so the
                # trailing candles never reached the output.
                series[asset] = pd.concat([bundled, candle_series])
            else:
                series[asset] = candle_series

        df = resample_history_df(pd.DataFrame(series), freq, field)
        # TODO: consider this more carefully
        df.dropna(inplace=True)

        return df
示例#10
0
    def get_history_window(self,
                           assets,
                           end_dt,
                           bar_count,
                           frequency,
                           field,
                           data_frequency=None,
                           is_current=False):
        """
        Public API method that returns a dataframe containing the requested
        history window, built from candles requested from the exchange.

        Parameters
        ----------
        assets : list[TradingPair]
            The assets whose data is desired.

        end_dt: datetime
            The date of the last bar.

        bar_count: int
            The number of bars desired.

        frequency: string
            "1d" or "1m"

        field: string
            The desired field of the asset.

        data_frequency: string
            The frequency of the data to query; i.e. whether the data is
            'daily' or 'minute' bars.

        is_current: bool
            Skip date filters when current data is requested (last few bars
            until now).

        Notes
        -----
        Catalyst requires an end date with a bar count, but CCXT wants a
        start date with a bar count. Since we have to make calculations
        here, we ensure that the last candle matches the end_dt parameter.

        Returns
        -------
        DataFrame
            A dataframe containing the requested data, without the rows
            holding missing values.

        Raises
        ------
        LastCandleTooEarlyError
            The last candle of an asset is more than one candle older than
            end_dt.

        """
        freq, candle_size, unit, data_frequency = get_frequency(
            frequency, data_frequency)
        start_dt = get_start_dt(
            end_dt, candle_size * bar_count, data_frequency)

        # No date filter is sent to the exchange for current data.
        if is_current:
            filter_start, filter_end = None, None
        else:
            filter_start, filter_end = start_dt, end_dt

        # The get_history method supports multiple asset
        candles = self.get_candles(
            freq=freq,
            assets=assets,
            bar_count=bar_count,
            start_dt=filter_start,
            end_dt=filter_end,
        )

        series = dict()
        for asset in candles:
            asset_series = self.get_series_from_candles(
                candles=candles[asset],
                start_dt=start_dt,
                end_dt=end_dt,
                data_frequency=frequency,
                field=field,
            )

            if end_dt is None:
                series[asset] = asset_series
                continue

            # The last candle may lag end_dt by at most one candle.
            earliest_allowed = end_dt - get_delta(candle_size, data_frequency)
            last_traded = asset_series.index[-1]
            if last_traded < earliest_allowed:
                raise LastCandleTooEarlyError(
                    last_traded=last_traded,
                    end_dt=earliest_allowed,
                    exchange=self.name,
                )

            series[asset] = asset_series

        window = pd.DataFrame(series)
        window.dropna(inplace=True)
        return window
示例#11
0
    def get_history_window(self,
                           assets,
                           end_dt,
                           bar_count,
                           frequency,
                           field,
                           data_frequency=None,
                           ffill=True):
        """
        Public API method that returns a dataframe containing the requested
        history window, built from candles requested from the exchange.

        Parameters
        ----------
        assets : list[TradingPair]
            The assets whose data is desired.

        end_dt: datetime
            The date of the last bar.

        bar_count: int
            The number of bars desired.

        frequency: string
            "1d" or "1m"

        field: string
            The desired field of the asset.

        data_frequency: string
            The frequency of the data to query; i.e. whether the data is
            'daily' or 'minute' bars.

        # TODO: fill how?
        ffill: boolean
            Currently not used by this method.

        Returns
        -------
        DataFrame
            A dataframe containing the requested data, without the rows
            holding missing values.

        """
        freq, candle_size, unit, data_frequency = get_frequency(
            frequency, data_frequency)
        start_dt = get_start_dt(
            end_dt, candle_size * bar_count, data_frequency)

        # The get_history method supports multiple asset
        candles = self.get_candles(freq=freq,
                                   assets=assets,
                                   bar_count=bar_count,
                                   start_dt=start_dt,
                                   end_dt=end_dt)

        # One series per asset for which candles were returned.
        series = dict()
        for asset in candles:
            series[asset] = self.get_series_from_candles(
                candles=candles[asset],
                start_dt=start_dt,
                end_dt=end_dt,
                data_frequency=frequency,
                field=field,
            )

        window = pd.DataFrame(series)
        window.dropna(inplace=True)
        return window