def filter_existing_assets(self, assets, start_dt, end_dt, data_frequency):
    """
    Return the assets missing price data in the bundle for the chunk.

    For each asset, check the close on the start and end dates of the
    chunk. If the data exists, the chunk ingestion is complete and the
    asset is filtered out. If any data is missing, the asset is returned
    so its data can be ingested.

    Parameters
    ----------
    assets: list[TradingPair]
        The assets in scope.
    start_dt: pd.Timestamp
        The chunk start date.
    end_dt: pd.Timestamp
        The chunk end date.
    data_frequency: str

    Returns
    -------
    list[TradingPair]
        The assets missing from the bundle.
    """
    reader = self.get_reader(data_frequency)
    # Keep only the assets whose chunk date range is not already covered
    # by the bundle.
    return [
        asset for asset in assets
        if not range_in_bundle(asset, start_dt, end_dt, reader)
    ]
def get_history_window_series(self, assets, end_dt, bar_count, field, data_frequency, reset_reader=False):
    """
    Build a series of field values for each asset over a trailing window.

    Parameters
    ----------
    assets: list[TradingPair]
        The assets in scope.
    end_dt: pd.Timestamp
        The last bar of the window.
    bar_count: int
        The number of bars in the window.
    field: str
        The OHLCV field to read.
    data_frequency: str
    reset_reader: bool
        If True, discard the cached bundle reader and re-create it.

    Returns
    -------
    dict[TradingPair, pd.Series]
        One series of field values per asset, indexed by calendar periods.

    Raises
    ------
    PricingDataNotLoadedError
        If no reader is available or an asset's data is missing from the
        bundle for the adjusted window.
    DataCorruptionError
        If the reader returns no arrays for an asset.
    PricingDataValueError
        If the values cannot be aligned with the calendar periods.
    """
    start_dt = get_start_dt(end_dt, bar_count, data_frequency, False)
    start_dt, _ = self.get_adj_dates(
        start_dt, end_dt, assets, data_frequency
    )

    # This is an attempt to resolve some caching with the reader
    # when auto-ingesting data.
    # TODO: needs more work
    reader = self.get_reader(data_frequency)
    if reset_reader:
        del self._readers[reader._rootdir]
        reader = self.get_reader(data_frequency)

    if reader is None:
        symbols = [asset.symbol for asset in assets]
        raise PricingDataNotLoadedError(
            field=field,
            first_trading_day=min([asset.start_date for asset in assets]),
            exchange=self.exchange_name,
            symbols=symbols,
            symbol_list=','.join(symbols),
            data_frequency=data_frequency,
            start_dt=start_dt,
            end_dt=end_dt
        )

    series = dict()
    for asset in assets:
        # Adjust the window start to this asset's own trading bounds.
        # Fix: pass [asset] instead of all assets, otherwise this call is
        # identical to the loop-invariant adjustment above and the
        # per-asset adjustment never happens.
        asset_start_dt, _ = self.get_adj_dates(
            start_dt, end_dt, [asset], data_frequency
        )
        in_bundle = range_in_bundle(
            asset, asset_start_dt, end_dt, reader
        )
        if not in_bundle:
            raise PricingDataNotLoadedError(
                field=field,
                first_trading_day=asset.start_date,
                exchange=self.exchange_name,
                symbols=asset.symbol,
                symbol_list=asset.symbol,
                data_frequency=data_frequency,
                start_dt=asset_start_dt,
                end_dt=end_dt
            )

        periods = self.get_calendar_periods_range(
            asset_start_dt, end_dt, data_frequency
        )
        # This does not behave well when requesting multiple assets
        # when the start or end date of one asset is outside of the range
        # looking at the logic in load_raw_arrays(), we are not achieving
        # any performance gain by requesting multiple sids at once. It's
        # looping through the sids and making separate requests anyway.
        # Fix: read from asset_start_dt (not the global start_dt) so the
        # returned values line up with the `periods` index built above;
        # a mismatch made pd.Series raise the ValueError wrapped below.
        arrays = reader.load_raw_arrays(
            sids=[asset.sid],
            fields=[field],
            start_dt=asset_start_dt,
            end_dt=end_dt
        )
        if len(arrays) == 0:
            raise DataCorruptionError(
                exchange=self.exchange_name,
                symbols=asset.symbol,
                start_dt=asset_start_dt,
                end_dt=end_dt
            )

        field_values = arrays[0][:, 0]
        try:
            value_series = pd.Series(field_values, index=periods)
            series[asset] = value_series
        except ValueError as e:
            raise PricingDataValueError(
                exchange=asset.exchange,
                symbol=asset.symbol,
                start_dt=asset_start_dt,
                end_dt=end_dt,
                error=e
            )
    return series
def prepare_chunks(self, assets, data_frequency, start_dt, end_dt):
    """
    Split a price data request into chunks corresponding to individual
    bundles.

    Parameters
    ----------
    assets: list[TradingPair]
    data_frequency: str
    start_dt: pd.Timestamp
    end_dt: pd.Timestamp

    Returns
    -------
    dict[TradingPair, list[dict(str, Object]]]
    """
    # Minute data is bundled by month, everything else by year.
    if data_frequency == 'minute':
        get_start_end = get_month_start_end
        periods_freq = 'MS'
    else:
        get_start_end = get_year_start_end
        periods_freq = 'AS'

    # Reader for the main bundle, used to check which chunks already
    # have data.
    reader = self.get_reader(data_frequency)
    chunks = dict()

    for asset in assets:
        # Verify the asset has price data in the requested date range;
        # if not, skip it and move on to the next asset.
        try:
            adj_start, adj_end = self.get_adj_dates(
                start_dt, end_dt, [asset], data_frequency
            )
        except NoDataAvailableOnExchange as e:
            log.debug('skipping {}: {}'.format(asset.symbol, e))
            continue

        dates = pd.date_range(
            start=get_period_label(adj_start, data_frequency),
            end=get_period_label(adj_end, data_frequency),
            freq=periods_freq,
            tz=UTC
        )
        # Clamp both ends of the range in place so the chunks don't go
        # over the asset's adjusted trading bounds.
        dates.values[0] = adj_start
        dates.values[-1] = adj_end

        asset_chunks = []
        last_index = len(dates) - 1
        for index, dt in enumerate(dates):
            period_start, period_end = get_start_end(
                dt=dt,
                first_day=dt if index == 0 else None,
                last_day=dt if index == last_index else None
            )
            # Currencies don't always start trading at midnight, so for
            # minute data probe the last minute of the day instead.
            if data_frequency == 'minute':
                range_start = period_start.replace(hour=23, minute=59)
            else:
                range_start = period_start

            # Only create a chunk for ingestion when the bundle does not
            # already hold data for this period.
            if not range_in_bundle(asset, range_start, period_end, reader):
                asset_chunks.append(dict(
                    asset=asset,
                    period=get_period_label(dt, data_frequency),
                ))

        # Sort the chunks chronologically by period label.
        asset_chunks.sort(key=lambda c: pd.to_datetime(c['period']))
        chunks[asset] = asset_chunks

    return chunks