Exemplo n.º 1
0
    def _spot_empty_periods(self, ohlcv_df, asset, data_frequency,
                            empty_rows_behavior):
        """
        Look for rows of ``ohlcv_df`` holding at least one null value and
        react to those dated after ``asset.start_date``.

        Parameters
        ----------
        ohlcv_df: DataFrame
            Frame to inspect. With the default (strip) behavior its null
            rows are dropped in place.
        asset: TradingPair
            ``symbol``, ``start_date`` and ``end_minute`` / ``end_daily``
            are read from it.
        data_frequency: str
            'minute' selects ``asset.end_minute`` as the reported end date,
            anything else selects ``asset.end_daily``.
        empty_rows_behavior: str
            'warn' logs the problem, 'raise' raises, any other value drops
            the null rows from ``ohlcv_df``.

        Returns
        -------
        list
            ``[]`` when no row has a null value, ``[None]`` when every
            null row is dated on or before ``asset.start_date``, otherwise
            a one-item list with the problem description.

        Raises
        ------
        EmptyValuesInBundleError
            When offending rows exist and ``empty_rows_behavior`` is
            'raise'.
        """
        null_index = ohlcv_df[ohlcv_df.isnull().any(axis=1)].index
        if len(null_index) == 0:
            return []

        # Only null rows strictly after the asset's start date count.
        stamps = (pd.to_datetime(raw, utc=True) for raw in null_index.values)
        dates = [stamp for stamp in stamps if stamp > asset.start_date]
        if not dates:
            # Null rows exist but none is reportable: a None placeholder
            # is still returned for this frame.
            return [None]

        if data_frequency == 'minute':
            end_dt = asset.end_minute
        else:
            end_dt = asset.end_daily

        formatted_dates = [date.strftime(DATE_TIME_FORMAT) for date in dates]
        problem = '{name} ({start_dt} to {end_dt}) has empty ' \
                  'periods: {dates}'.format(
                      name=asset.symbol,
                      start_dt=asset.start_date.strftime(DATE_TIME_FORMAT),
                      end_dt=end_dt.strftime(DATE_TIME_FORMAT),
                      dates=formatted_dates)

        if empty_rows_behavior == 'raise':
            raise EmptyValuesInBundleError(
                name=asset.symbol,
                end_minute=end_dt,
                dates=dates,
            )

        if empty_rows_behavior == 'warn':
            log.warn(problem)
        else:
            ohlcv_df.dropna(inplace=True)

        return [problem]
Exemplo n.º 2
0
    def ingest_df(self,
                  ohlcv_df,
                  data_frequency,
                  asset,
                  writer,
                  empty_rows_behavior='strip'):
        """
        Ingest a DataFrame of OHLCV data for a given market.

        Rows holding at least one null value are handled according to
        ``empty_rows_behavior``; the frame is then sorted by index and
        handed to ``self._write`` together with ``asset.sid``.

        Parameters
        ----------
        ohlcv_df: DataFrame
            Data to ingest. It is modified in place: null rows may be
            dropped and the index is sorted.
        data_frequency: str
            Forwarded to ``get_delta`` and ``self._write``.
        asset: TradingPair
            ``sid``, ``symbol`` and ``end_minute`` are read from it.
        writer:
            Forwarded untouched to ``self._write``.
        empty_rows_behavior: str
            'ignore' skips the null-row check, 'warn' logs the affected
            ranges, 'raise' raises, any other value (default 'strip')
            drops the null rows.

        Raises
        ------
        EmptyValuesInBundleError
            When null rows exist and ``empty_rows_behavior`` is 'raise'.

        """
        # Compare by value: an identity test against a string literal only
        # succeeds when the caller's string happens to be interned.
        if empty_rows_behavior != 'ignore':
            nan_rows = ohlcv_df[ohlcv_df.isnull().any(axis=1)].index

            # len() rather than truthiness: a pandas Index has no
            # unambiguous truth value.
            if len(nan_rows) > 0:
                nan_dates = [pd.to_datetime(value)
                             for value in nan_rows.values]

                # Collapse the null dates into a flat list of range
                # boundaries: start, end, start, end, ...
                dates = [nan_dates[0]]
                for previous_date, row_date in zip(nan_dates, nan_dates[1:]):
                    seq_date = previous_date + get_delta(1, data_frequency)

                    # A gap wider than one period closes the current range
                    # and opens a new one.
                    if row_date > seq_date:
                        dates.append(previous_date)
                        dates.append(row_date)

                dates.append(nan_dates[-1])

                name = '{} from {} to {}'.format(asset.symbol,
                                                 ohlcv_df.index[0],
                                                 ohlcv_df.index[-1])
                if empty_rows_behavior == 'warn':
                    log.warn(
                        '\n{name} with end minute {end_minute} has empty rows '
                        'in ranges: {dates}'.format(
                            name=name,
                            end_minute=asset.end_minute,
                            dates=dates))

                elif empty_rows_behavior == 'raise':
                    raise EmptyValuesInBundleError(name=name,
                                                   end_minute=asset.end_minute,
                                                   dates=dates)
                else:
                    ohlcv_df.dropna(inplace=True)

        # An empty frame still triggers a write, with an empty payload.
        data = []
        if not ohlcv_df.empty:
            ohlcv_df.sort_index(inplace=True)
            data.append((asset.sid, ohlcv_df))

        self._write(data, writer, data_frequency)
Exemplo n.º 3
0
    def ingest_ctable(self,
                      asset,
                      data_frequency,
                      period,
                      start_dt,
                      end_dt,
                      writer,
                      empty_rows_behavior='strip',
                      cleanup=False):
        """
        Merge a ctable bundle chunk into the main bundle for the exchange.

        :param asset: TradingPair
            ``sid``, ``symbol`` and ``end_minute`` are read from it.
        :param data_frequency: str
        :param period: str
            Forwarded to ``get_bcolz_chunk`` to locate the chunk.
        :param start_dt:
        :param end_dt:
            Bounds forwarded to the reader's ``load_raw_arrays`` and to
            ``self.get_calendar_periods_range``.
        :param writer:
            Forwarded untouched to ``self._write``.
        :param empty_rows_behavior: str
            Ensure that the bundle does not have any missing data.
            'ignore' skips the check, 'warn' logs the affected ranges,
            'raise' raises, any other value (default 'strip') drops the
            rows holding null values.

        :param cleanup: bool
            Remove the temp bundle directory after ingestion.

        :return: the chunk path obtained from ``get_bcolz_chunk``
        :raises TempBundleNotFoundError: when ``self.get_reader`` returns
            None for the chunk path
        :raises EmptyValuesInBundleError: when null rows exist and
            ``empty_rows_behavior`` is 'raise'
        """

        path = get_bcolz_chunk(exchange_name=self.exchange.name,
                               symbol=asset.symbol,
                               data_frequency=data_frequency,
                               period=period)

        reader = self.get_reader(data_frequency, path=path)
        if reader is None:
            raise TempBundleNotFoundError(path=path)

        arrays = reader.load_raw_arrays(
            sids=[asset.sid],
            fields=['open', 'high', 'low', 'close', 'volume'],
            start_dt=start_dt,
            end_dt=end_dt)

        # Nothing to merge. The chunk folder is left in place on this path
        # even when ``cleanup`` is set.
        if not arrays:
            return path

        periods = self.get_calendar_periods_range(start_dt, end_dt,
                                                  data_frequency)

        df = get_df_from_arrays(arrays, periods)

        # Compare by value: an identity test against a string literal only
        # succeeds when the caller's string happens to be interned.
        if empty_rows_behavior != 'ignore':
            nan_rows = df[df.isnull().any(axis=1)].index

            # len() rather than truthiness: a pandas Index has no
            # unambiguous truth value.
            if len(nan_rows) > 0:
                nan_dates = [pd.to_datetime(value)
                             for value in nan_rows.values]

                # Collapse the null dates into a flat list of range
                # boundaries: start, end, start, end, ...
                dates = [nan_dates[0]]
                for previous_date, row_date in zip(nan_dates, nan_dates[1:]):
                    seq_date = previous_date + get_delta(1, data_frequency)

                    # A gap wider than one period closes the current range
                    # and opens a new one.
                    if row_date > seq_date:
                        dates.append(previous_date)
                        dates.append(row_date)

                dates.append(nan_dates[-1])

                # Last '/'-separated component of the chunk path.
                name = path.split('/')[-1]
                if empty_rows_behavior == 'warn':
                    log.warn(
                        '\n{name} with end minute {end_minute} has empty rows '
                        'in ranges: {dates}'.format(
                            name=name,
                            end_minute=asset.end_minute,
                            dates=dates))

                elif empty_rows_behavior == 'raise':
                    raise EmptyValuesInBundleError(name=name,
                                                   end_minute=asset.end_minute,
                                                   dates=dates)
                else:
                    df.dropna(inplace=True)

        # An empty frame still triggers a write, with an empty payload.
        data = []
        if not df.empty:
            df.sort_index(inplace=True)
            data.append((asset.sid, df))
        self._write(data, writer, data_frequency)

        if cleanup:
            log.debug('removing bundle folder following '
                      'ingestion: {}'.format(path))
            shutil.rmtree(path)

        return path