Example #1
    def bcolz_exchange_daily_write_read(self, exchange_name):
        start = pd.to_datetime('2017-10-01 00:00')
        end = pd.to_datetime('today')
        freq = 'daily'

        bundle = ExchangeBundle(exchange_name)

        df = self.generate_df(exchange_name, freq, start, end)

        print(df.index[0], df.index[-1])

        writer = BcolzExchangeBarWriter(rootdir=self.root_dir,
                                        start_session=df.index[0],
                                        end_session=df.index[-1],
                                        data_frequency=freq,
                                        write_metadata=True)

        data = [(1, df)]
        writer.write(data)

        reader = BcolzExchangeBarReader(rootdir=self.root_dir,
                                        data_frequency=freq)

        arrays = reader.load_raw_arrays(self.columns, start, end, [1])

        periods = bundle.get_calendar_periods_range(start, end, freq)

        dx = get_df_from_arrays(arrays, periods)

        assert_equals(df.equals(dx), True)
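The test above relies on a generate_df helper that is not shown in this listing. A minimal, hypothetical sketch of such a helper is given below; the helper body and its random-walk values are assumptions for illustration, with column names matching the OHLCV fields the reader loads:

import numpy as np
import pandas as pd


def generate_df(exchange_name, freq, start, end):
    # Hypothetical helper: build a synthetic OHLCV frame on a UTC index
    # between start and end ('daily' -> calendar days, otherwise minutes).
    index = pd.date_range(start=start, end=end,
                          freq='D' if freq == 'daily' else 'T', tz='UTC')
    rng = np.random.RandomState(42)
    close = 100 + rng.randn(len(index)).cumsum()
    return pd.DataFrame({
        'open': close + rng.rand(len(index)),
        'high': close + 1.0,
        'low': close - 1.0,
        'close': close,
        'volume': rng.randint(1, 1000, len(index)).astype(float),
    }, index=index)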
Example #2
    def chunk_to_df(self, exchange_name, symbol, data_frequency, period):

        exchange = get_exchange(exchange_name)
        asset = exchange.get_asset(symbol)

        filename = get_bcolz_chunk(exchange_name=exchange_name,
                                   symbol=symbol,
                                   data_frequency=data_frequency,
                                   period=period)

        reader = BcolzExchangeBarReader(rootdir=filename,
                                        data_frequency=data_frequency)

        # metadata = BcolzMinuteBarMetadata.read(filename)

        start = reader.first_trading_day
        end = reader.last_available_dt

        if data_frequency == 'daily':
            end = end - pd.Timedelta(hours=23, minutes=59)

        print(start, end, data_frequency)

        arrays = reader.load_raw_arrays(self.columns, start, end, [asset.sid])

        bundle = ExchangeBundle(exchange_name)

        periods = bundle.get_calendar_periods_range(start, end, data_frequency)

        return get_df_from_arrays(arrays, periods)
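A hypothetical call to chunk_to_df could look like the sketch below; the exchange, symbol, and period values are illustrative and not taken from the original code:

df = self.chunk_to_df(exchange_name='bitfinex',
                      symbol='btc_usd',
                      data_frequency='daily',
                      period='2017')
print(df.head())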
Example #3
    def _bundle_to_csv(self,
                       asset,
                       exchange_name,
                       data_frequency,
                       filename,
                       path=None,
                       start_dt=None,
                       end_dt=None):
        bundle = ExchangeBundle(exchange_name)
        reader = bundle.get_reader(data_frequency, path=path)

        if start_dt is None:
            start_dt = reader.first_trading_day

        if end_dt is None:
            end_dt = reader.last_available_dt

        if data_frequency == 'daily':
            end_dt = end_dt - pd.Timedelta(hours=23, minutes=59)

        arrays = None
        try:
            arrays = reader.load_raw_arrays(
                sids=[asset.sid],
                fields=['open', 'high', 'low', 'close', 'volume'],
                start_dt=start_dt,
                end_dt=end_dt)
        except Exception as e:
            log.warn('skipping ctable for {} from {} to {}: {}'.format(
                asset.symbol, start_dt, end_dt, e))

        periods = bundle.get_calendar_periods_range(start_dt, end_dt,
                                                    data_frequency)
        df = get_df_from_arrays(arrays, periods)

        folder = os.path.join(tempfile.gettempdir(), 'catalyst', exchange_name,
                              asset.symbol)
        ensure_directory(folder)

        path = os.path.join(folder, filename + '.csv')

        log.info('creating csv file: {}'.format(path))
        print('HEAD\n{}'.format(df.head(100)))
        print('TAIL\n{}'.format(df.tail(100)))
        df.to_csv(path)
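The end_dt adjustment for daily data appears in several of these examples. The sketch below illustrates the arithmetic with made-up timestamps, assuming last_available_dt points at the final minute of a session, so that subtracting 23 hours and 59 minutes yields the session's midnight label:

import pandas as pd

last_available_dt = pd.Timestamp('2017-10-15 23:59', tz='UTC')  # illustrative
end_dt = last_available_dt - pd.Timedelta(hours=23, minutes=59)
print(end_dt)  # 2017-10-15 00:00:00+00:00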
Example #4
    def ingest_ctable(self,
                      asset,
                      data_frequency,
                      period,
                      writer,
                      empty_rows_behavior='strip',
                      duplicates_threshold=100,
                      cleanup=False):
        """
        Merge a ctable bundle chunk into the main bundle for the exchange.

        Parameters
        ----------
        asset: TradingPair
        data_frequency: str
        period: str
        writer:
        empty_rows_behavior: str
            How to handle rows with missing values: 'strip', 'warn',
            'raise' or 'ignore'.

        cleanup: bool
            Remove the temp bundle directory after ingestion.

        Returns
        -------
        list[str]
            A list of problems which occurred during ingestion.

        """
        problems = []

        # Download and extract the bundle
        path = get_bcolz_chunk(exchange_name=self.exchange_name,
                               symbol=asset.symbol,
                               data_frequency=data_frequency,
                               period=period)

        reader = self.get_reader(data_frequency, path=path)
        if reader is None:
            try:
                log.warn('the reader is unable to use bundle: {}, '
                         'deleting it.'.format(path))
                shutil.rmtree(path)

            except Exception as e:
                log.warn('unable to remove temp bundle: {}'.format(e))

            raise TempBundleNotFoundError(path=path)

        start_dt = reader.first_trading_day
        end_dt = reader.last_available_dt

        if data_frequency == 'daily':
            end_dt = end_dt - pd.Timedelta(hours=23, minutes=59)

        arrays = None
        try:
            arrays = reader.load_raw_arrays(
                sids=[asset.sid],
                fields=['open', 'high', 'low', 'close', 'volume'],
                start_dt=start_dt,
                end_dt=end_dt)
        except Exception as e:
            log.warn('skipping ctable for {} from {} to {}: {}'.format(
                asset.symbol, start_dt, end_dt, e))

        if not arrays:
            return reader._rootdir

        periods = self.get_calendar_periods_range(start_dt, end_dt,
                                                  data_frequency)
        df = get_df_from_arrays(arrays, periods)
        problems += self.ingest_df(ohlcv_df=df,
                                   data_frequency=data_frequency,
                                   asset=asset,
                                   writer=writer,
                                   empty_rows_behavior=empty_rows_behavior,
                                   duplicates_threshold=duplicates_threshold)

        if cleanup:
            log.debug('removing bundle folder following ingestion: {}'.format(
                reader._rootdir))
            shutil.rmtree(reader._rootdir)

        return list(filter(partial(is_not, None), problems))
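The final return drops None entries from the problems list. A standalone sketch of the same filtering, assuming partial comes from functools and is_not from operator, as the names suggest:

from functools import partial
from operator import is_not

problems = ['missing data for 2017-10-02', None, 'duplicate minute bars']
print(list(filter(partial(is_not, None), problems)))
# ['missing data for 2017-10-02', 'duplicate minute bars']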
Example #5
    def ingest_ctable(self,
                      asset,
                      data_frequency,
                      period,
                      start_dt,
                      end_dt,
                      writer,
                      empty_rows_behavior='strip',
                      cleanup=False):
        """
        Merge a ctable bundle chunk into the main bundle for the exchange.

        :param asset: TradingPair
        :param data_frequency: str
        :param period: str
        :param start_dt: pd.Timestamp
        :param end_dt: pd.Timestamp
        :param writer:
        :param empty_rows_behavior: str
            How to handle rows with missing values: 'strip', 'warn',
            'raise' or 'ignore'.

        :param cleanup: bool
            Remove the temp bundle directory after ingestion.

        :return: The path to the ctable bundle chunk.
        """

        path = get_bcolz_chunk(exchange_name=self.exchange.name,
                               symbol=asset.symbol,
                               data_frequency=data_frequency,
                               period=period)

        reader = self.get_reader(data_frequency, path=path)
        if reader is None:
            raise TempBundleNotFoundError(path=path)

        arrays = reader.load_raw_arrays(
            sids=[asset.sid],
            fields=['open', 'high', 'low', 'close', 'volume'],
            start_dt=start_dt,
            end_dt=end_dt)

        if not arrays:
            return path

        periods = self.get_calendar_periods_range(start_dt, end_dt,
                                                  data_frequency)

        df = get_df_from_arrays(arrays, periods)

        if empty_rows_behavior != 'ignore':
            nan_rows = df[df.isnull().any(axis=1)].index

            if len(nan_rows) > 0:
                dates = []
                previous_date = None
                for row_date in nan_rows.values:
                    row_date = pd.to_datetime(row_date)

                    if previous_date is None:
                        dates.append(row_date)

                    else:
                        seq_date = previous_date + get_delta(1, data_frequency)

                        if row_date > seq_date:
                            dates.append(previous_date)
                            dates.append(row_date)

                    previous_date = row_date

                dates.append(pd.to_datetime(nan_rows.values[-1]))

                name = path.split('/')[-1]
                if empty_rows_behavior == 'warn':
                    log.warn(
                        '\n{name} with end minute {end_minute} has empty rows '
                        'in ranges: {dates}'.format(
                            name=name,
                            end_minute=asset.end_minute,
                            dates=dates))

                elif empty_rows_behavior == 'raise':
                    raise EmptyValuesInBundleError(name=name,
                                                   end_minute=asset.end_minute,
                                                   dates=dates)
                else:
                    df.dropna(inplace=True)

        data = []
        if not df.empty:
            df.sort_index(inplace=True)
            data.append((asset.sid, df))
        self._write(data, writer, data_frequency)

        if cleanup:
            log.debug('removing bundle folder following '
                      'ingestion: {}'.format(path))
            shutil.rmtree(path)

        return path
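The empty-row handling above collapses consecutive NaN dates into range boundaries before warning or raising. A standalone sketch of that grouping, with made-up dates and pd.Timedelta(days=1) standing in for get_delta(1, 'daily'):

import pandas as pd

nan_rows = pd.to_datetime(['2017-10-02', '2017-10-03', '2017-10-07'])

dates = []
previous_date = None
for row_date in nan_rows:
    if previous_date is None:
        dates.append(row_date)
    elif row_date > previous_date + pd.Timedelta(days=1):
        # A jump larger than one period closes the current range
        # and opens a new one.
        dates.append(previous_date)
        dates.append(row_date)
    previous_date = row_date
dates.append(nan_rows[-1])

# Boundary dates: 10-02, 10-03, 10-07, 10-07 -> ranges
# (2017-10-02, 2017-10-03) and (2017-10-07, 2017-10-07).
print(dates)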