def _init(self, *args, **kwargs):
    # not sure why I have to use this awkward convention to get the param
    code = kwargs["params"][0][1]
    f = io.StringIO()
    download_master_file(f, universes=code, fields="Sid")
    self._real_sids = pd.read_csv(f).Sid.tolist()
    return super(Universe, self)._init(*args, **kwargs)

def collect_combo(universe, contract_months, tick_db, until=None):
    """
    Creates a combo and initiates real-time data collection for it.

    Parameters
    ----------
    universe : str, required
        the universe of futures from which to select the contracts

    contract_months : tuple, required
        a tuple of contract months which should make up the legs of the
        spread, for example, (1, 2) for the first and second closest months
        to expiration

    tick_db : str, required
        the tick db code for real-time data collection

    until : str, optional
        collect real-time data until this time (for example,
        '16:30:00 America/New_York')
    """
    # query non-expired futures contracts
    f = io.StringIO()
    download_master_file(f, universes=universe, fields="RolloverDate",
                         exclude_expired=True)
    contracts = pd.read_csv(f, index_col="RolloverDate",
                            parse_dates=["RolloverDate"])

    # sort by RolloverDate
    sids = contracts.Sid.sort_index().tolist()
    sid_1 = sids[contract_months[0] - 1]
    sid_2 = sids[contract_months[1] - 1]

    # Create the combo if it doesn't already exist
    result = create_ibkr_combo([
        ["BUY", 1, sid_1],
        ["SELL", 1, sid_2]
    ])
    combo_sid = result["sid"]

    # initiate data collection
    collect_market_data(tick_db, sids=combo_sid, until=until)

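# A hedged usage sketch of collect_combo. The universe code ("cl-fut"), tick
# db code ("cl-combo-tick"), and contract months are hypothetical values, not
# ones defined above; the 'until' value is taken from the docstring example.
collect_combo(
    "cl-fut",                  # hypothetical universe of futures
    contract_months=(1, 2),    # buy the front month, sell the second month
    tick_db="cl-combo-tick",   # hypothetical tick db created beforehand
    until="16:30:00 America/New_York")
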
def _get_securities(self):
    """
    Queries the master db and gets a securities master file.
    """
    f = io.StringIO()
    download_master_file(
        f,
        universes=self.universes,
        conids=self.conids,
        exclude_universes=self.exclude_universes,
        exclude_conids=self.exclude_conids,
        delisted=True,
        fields=[
            "ConId", "PrimaryExchange", "Symbol", "SecType", "LocalSymbol",
            "LongName", "MinTick", "Multiplier", "LastTradeDate",
            "ContractMonth", "Timezone", "UnderConId"
        ])

    self.securities = pd.read_csv(f, index_col="ConId").sort_values(by="Symbol")

def get_prices(codes, start_date=None, end_date=None,
               universes=None, conids=None,
               exclude_universes=None, exclude_conids=None,
               times=None, fields=None,
               timezone=None, infer_timezone=None,
               cont_fut=None, master_fields=None):
    """
    Query one or more history databases and/or real-time aggregate databases
    and load prices into a DataFrame.

    For bar sizes smaller than 1-day, the resulting DataFrame will have a
    MultiIndex with levels (Field, Date, Time). For bar sizes of 1-day or
    larger, the MultiIndex will have levels (Field, Date).

    Parameters
    ----------
    codes : str or list of str, required
        the code(s) of one or more databases to query. If multiple databases
        are specified, they must have the same bar size. List databases in
        order of priority (highest priority first). If multiple databases
        provide the same field for the same conid on the same datetime, the
        first database's value will be used.
    start_date : str (YYYY-MM-DD), optional
        limit to data on or after this date
    end_date : str (YYYY-MM-DD), optional
        limit to data on or before this date
    universes : list of str, optional
        limit to these universes (default is to return all securities in database)
    conids : list of int, optional
        limit to these conids
    exclude_universes : list of str, optional
        exclude these universes
    exclude_conids : list of int, optional
        exclude these conids
    times: list of str (HH:MM:SS), optional
        limit to these times, specified in the timezone of the relevant
        exchange. See additional information in the Notes section regarding
        the timezone to use.
    fields : list of str, optional
        only return these fields. (If querying multiple databases that have
        different fields, provide the complete list of desired fields; only
        the supported fields for each database will be queried.)
    timezone : str, optional
        convert timestamps to this timezone, for example America/New_York
        (see `pytz.all_timezones` for choices); ignored for non-intraday bar sizes
    infer_timezone : bool
        infer the timezone from the securities master Timezone field; defaults
        to True if using intraday bars and no `timezone` specified; ignored for
        non-intraday bars, or if `timezone` is passed
    cont_fut : str
        stitch futures into continuous contracts using this method (default is
        not to stitch together). Only applicable to history databases.
        Possible choices: concat
    master_fields : list of str, optional
        [DEPRECATED] append these fields from the securities master database
        (pass ['?'] or any invalid fieldname to see available fields). This
        parameter is deprecated and will be removed in a future release. For
        better performance, use `quantrocket.master.get_securities_reindexed_like`
        to get securities master data shaped like prices.

    Returns
    -------
    DataFrame
        a MultiIndex DataFrame

    Notes
    -----
    The `times` parameter, if provided, is applied differently for history
    databases vs real-time aggregate databases. For history databases, the
    parameter is applied when querying the database. For real-time aggregate
    databases, the parameter is not applied when querying the database;
    rather, all available times are retrieved and the `times` filter is
    applied to the resulting DataFrame after casting it to the appropriate
    timezone (as inferred from the securities master Timezone field or as
    explicitly specified with the `timezone` parameter). The rationale for
    this behavior is that history databases store intraday data in the
    timezone of the relevant exchange whereas real-time aggregate databases
    store data in UTC. By applying the `times` filter as described, users can
    specify the times in the timezone of the relevant exchange for both types
    of databases.

    Examples
    --------
    Load intraday prices:

    >>> prices = get_prices('stk-sample-5min', fields=["Close", "Volume"])
    >>> prices.head()
    ConId                         265598  38708077
    Field   Date        Time
    Close   2017-07-26  09:30:00  153.62    2715.0
                        09:35:00  153.46    2730.0
                        09:40:00  153.21    2725.0
                        09:45:00  153.28    2725.0
                        09:50:00  153.18    2725.0

    Isolate the closes:

    >>> closes = prices.loc["Close"]
    >>> closes.head()
    ConId                 265598  38708077
    Date        Time
    2017-07-26  09:30:00  153.62    2715.0
                09:35:00  153.46    2730.0
                09:40:00  153.21    2725.0
                09:45:00  153.28    2725.0
                09:50:00  153.18    2725.0

    Isolate the 15:45:00 prices:

    >>> session_closes = closes.xs("15:45:00", level="Time")
    >>> session_closes.head()
    ConId         265598  38708077
    Date
    2017-07-26    153.29   2700.00
    2017-07-27    150.10   2660.00
    2017-07-28    149.43   2650.02
    2017-07-31    148.99   2650.34
    2017-08-01    149.72   2675.50
    """
    # Import pandas lazily since it can take a moment to import
    try:
        import pandas as pd
    except ImportError:
        raise ImportError("pandas must be installed to use this function")

    try:
        import pytz
    except ImportError:
        raise ImportError("pytz must be installed to use this function")

    if timezone and timezone not in pytz.all_timezones:
        raise ParameterError(
            "invalid timezone: {0} (see `pytz.all_timezones` for choices)".format(
                timezone))

    dbs = codes
    if not isinstance(dbs, (list, tuple)):
        dbs = [dbs]

    fields = fields or []
    if not isinstance(fields, (list, tuple)):
        fields = [fields]

    # separate history dbs from realtime dbs
    history_dbs = set(list_history_databases())
    realtime_dbs = list_realtime_databases()
    realtime_agg_dbs = set(itertools.chain(*realtime_dbs.values()))

    history_dbs.intersection_update(set(dbs))
    realtime_agg_dbs.intersection_update(set(dbs))

    unknown_dbs = set(dbs) - history_dbs - realtime_agg_dbs

    if unknown_dbs:
        tick_dbs = set(realtime_dbs.keys()).intersection(unknown_dbs)
        # Improve error message if possible
        if tick_dbs:
            raise ParameterError(
                "{} is a real-time tick database, only history databases or "
                "real-time aggregate databases are supported".format(
                    ", ".join(tick_dbs)))
        raise ParameterError(
            "no history or real-time aggregate databases called {}".format(
                ", ".join(unknown_dbs)))

    db_bar_sizes = set()
    db_bar_sizes_parsed = set()
    db_domains = set()
    history_db_fields = {}
    realtime_db_fields = {}

    for db in history_dbs:
        db_config = get_history_db_config(db)
        bar_size = db_config.get("bar_size")
        db_bar_sizes.add(bar_size)
        # to validate uniform bar sizes, we need to parse them in case dbs
        # store different but equivalent timedelta strings. History db
        # strings may need massaging to be parsable.
        if bar_size.endswith("s"):
            # strip s from secs, mins, hours to get valid pandas timedelta
            bar_size = bar_size[:-1]
        elif bar_size == "1 week":
            bar_size = "7 day"
        elif bar_size == "1 month":
            bar_size = "30 day"
        db_bar_sizes_parsed.add(pd.Timedelta(bar_size))
        db_domain = db_config.get("domain", "main")
        db_domains.add(db_domain)
        history_db_fields[db] = db_config.get("fields", [])

    for db in realtime_agg_dbs:
        db_config = get_realtime_db_config(db)
        bar_size = db_config.get("bar_size")
        db_bar_sizes.add(bar_size)
        db_bar_sizes_parsed.add(pd.Timedelta(bar_size))
        # NOTE: aggregate dbs don't include a domain key; if available, this
        # would probably have to be obtained from the associated tick db.
        # This isn't an issue unless/until real-time data comes from multiple
        # vendors. As it now stands, real-time data is always from "main".
        db_domain = db_config.get("domain", "main")
        db_domains.add(db_domain)
        realtime_db_fields[db] = db_config.get("fields", [])

    if len(db_bar_sizes_parsed) > 1:
        raise ParameterError(
            "all databases must contain same bar size but {0} have different "
            "bar sizes: {1}".format(", ".join(dbs), ", ".join(db_bar_sizes))
        )

    if len(db_domains) > 1:
        raise ParameterError(
            "all databases must use the same securities master domain but {0} "
            "use different domains: {1}".format(", ".join(dbs), ", ".join(db_domains))
        )

    all_prices = []

    for db in dbs:

        if db in history_dbs:

            # different DBs might support different fields so only request the
            # subset of supported fields
            fields_for_db = set(fields).intersection(set(history_db_fields[db]))

            kwargs = dict(
                start_date=start_date,
                end_date=end_date,
                universes=universes,
                conids=conids,
                exclude_universes=exclude_universes,
                exclude_conids=exclude_conids,
                times=times,
                cont_fut=cont_fut,
                fields=list(fields_for_db),
                tz_naive=False
            )

            tmp_filepath = "{dir}{sep}history.{db}.{pid}.{time}.csv".format(
                dir=TMP_DIR, sep=os.path.sep, db=db, pid=os.getpid(), time=time.time())

            try:
                download_history_file(db, tmp_filepath, **kwargs)
            except NoHistoricalData as e:
                # don't complain about NoHistoricalData if we're checking
                # multiple databases, unless none of them have data
                if len(dbs) == 1:
                    raise
                else:
                    continue

            prices = pd.read_csv(tmp_filepath)
            all_prices.append(prices)

            os.remove(tmp_filepath)

        if db in realtime_agg_dbs:

            fields_for_db = set(fields).intersection(set(realtime_db_fields[db]))

            kwargs = dict(
                start_date=start_date,
                end_date=end_date,
                universes=universes,
                conids=conids,
                exclude_universes=exclude_universes,
                exclude_conids=exclude_conids,
                fields=list(fields_for_db))

            tmp_filepath = "{dir}{sep}realtime.{db}.{pid}.{time}.csv".format(
                dir=TMP_DIR, sep=os.path.sep, db=db, pid=os.getpid(), time=time.time())

            try:
                download_market_data_file(db, tmp_filepath, **kwargs)
            except NoRealtimeData as e:
                # don't complain about NoRealtimeData if we're checking
                # multiple databases, unless none of them have data
                if len(dbs) == 1:
                    raise
                else:
                    continue

            prices = pd.read_csv(tmp_filepath)
            all_prices.append(prices)

            os.remove(tmp_filepath)

    # complain if multiple dbs and none had data
    if len(dbs) > 1 and not all_prices:
        raise NoHistoricalData(
            "no price data matches the query parameters in any of {0}".format(
                ", ".join(dbs)
            ))

    prices = pd.concat(all_prices, sort=False)

    try:
        prices = prices.pivot(index="ConId", columns="Date").T
    except ValueError as e:
        if "duplicate" not in repr(e):
            raise
        # There are duplicates, likely due to querying multiple databases,
        # both of which return one or more identical dates for identical
        # conids. To resolve, we group by conid and date and take the first
        # available value for each field. This means that the order of
        # [codes] matters. The use of groupby.first() instead of simply
        # drop_duplicates allows us to retain one field from one db and
        # another field from another db.
        grouped = prices.groupby(["ConId", "Date"])
        prices = pd.concat(
            dict([(col, grouped[col].first()) for col in prices.columns
                  if col not in ("ConId", "Date")]), axis=1)
        prices = prices.reset_index().pivot(index="ConId", columns="Date").T

    prices.index.set_names(["Field", "Date"], inplace=True)

    master_fields = master_fields or []
    if master_fields:
        import warnings
        # DeprecationWarning is ignored by default but we want the user
        # to see it
        warnings.simplefilter("always", DeprecationWarning)
        warnings.warn(
            "`master_fields` parameter is deprecated and will be removed in a "
            "future release. For better performance, please use "
            "`quantrocket.master.get_securities_reindexed_like` "
            "to get securities master data shaped like prices.", DeprecationWarning)

        if isinstance(master_fields, tuple):
            master_fields = list(master_fields)
        elif not isinstance(master_fields, list):
            master_fields = [master_fields]

    # master fields that are required internally but shouldn't be returned to
    # the user (potentially Timezone)
    internal_master_fields = []

    is_intraday = list(db_bar_sizes_parsed)[0] < pd.Timedelta("1 day")

    if is_intraday and not timezone and infer_timezone is not False:
        infer_timezone = True
        if not master_fields or "Timezone" not in master_fields:
            internal_master_fields.append("Timezone")

    # Next, get the master file
    if master_fields or internal_master_fields:

        conids = list(prices.columns)

        domain = list(db_domains)[0] if db_domains else None

        f = six.StringIO()
        download_master_file(
            f,
            conids=conids,
            fields=master_fields + internal_master_fields,
            domain=domain
        )
        securities = pd.read_csv(f, index_col="ConId")

        if "Delisted" in securities.columns:
            securities.loc[:, "Delisted"] = securities.Delisted.astype(bool)

        if "Etf" in securities.columns:
            securities.loc[:, "Etf"] = securities.Etf.astype(bool)

        # Infer timezone if needed
        if not timezone and infer_timezone:
            timezones = securities.Timezone.unique()

            if len(timezones) > 1:
                raise ParameterError(
                    "cannot infer timezone because multiple timezones are present "
                    "in data, please specify timezone explicitly (timezones: {0})".format(
                        ", ".join(timezones)))

            timezone = timezones[0]

        # Drop any internal-only fields
        if internal_master_fields:
            securities = securities.drop(internal_master_fields, axis=1)

        if not securities.empty:

            # Append securities, indexed to the min date, to allow easy ffill on demand
            securities = pd.DataFrame(securities.T, columns=prices.columns)
            securities.index.name = "Field"
            idx = pd.MultiIndex.from_product(
                (securities.index, [prices.index.get_level_values("Date").min()]),
                names=["Field", "Date"])

            securities = securities.reindex(index=idx, level="Field")
            prices = pd.concat((prices, securities))

    if is_intraday:
        dates = pd.to_datetime(prices.index.get_level_values("Date"), utc=True)

        if timezone:
            dates = dates.tz_convert(timezone)
    else:
        dates = pd.to_datetime(prices.index.get_level_values("Date"))

    prices.index = pd.MultiIndex.from_arrays((
        prices.index.get_level_values("Field"),
        dates
    ), names=("Field", "Date"))

    # Split date and time
    dts = prices.index.get_level_values("Date")
    dates = pd.to_datetime(dts.date).tz_localize(None)  # drop tz-aware in Date index
    prices.index = pd.MultiIndex.from_arrays(
        (prices.index.get_level_values("Field"),
         dates,
         dts.strftime("%H:%M:%S")),
        names=["Field", "Date", "Time"]
    )

    # Align dates if there are any duplicates. Explanation: Suppose there are
    # two timezones represented in the data (e.g. history db in security
    # timezone vs real-time db in UTC). After parsing these dates into a
    # common timezone, they will align properly, but we pivoted before
    # parsing the dates (for performance reasons), so they may not be
    # aligned. Thus we need to dedupe the index.
    prices = prices.groupby(prices.index).first()
    prices.index = pd.MultiIndex.from_tuples(prices.index)
    prices.index.set_names(["Field", "Date", "Time"], inplace=True)

    # Drop time if not intraday
    if not is_intraday:
        prices.index = prices.index.droplevel("Time")
        return prices

    # If intraday, fill missing times so that each date has the same set of
    # times, allowing easier comparisons. Example implications:
    # - if history is retrieved intraday, this ensures that today will have NaN
    #   entries for future times
    # - early close dates will have a full set of times, with NaNs after the
    #   early close
    unique_fields = prices.index.get_level_values("Field").unique()
    unique_dates = prices.index.get_level_values("Date").unique()
    unique_times = prices.index.get_level_values("Time").unique()
    interpolated_index = None
    for field in unique_fields:
        if master_fields and field in master_fields:
            min_date = prices.loc[field].index.min()
            field_idx = pd.MultiIndex.from_tuples([(field, min_date[0], min_date[1])])
        else:
            field_idx = pd.MultiIndex.from_product(
                [[field], unique_dates, unique_times]).sort_values()
        if interpolated_index is None:
            interpolated_index = field_idx
        else:
            interpolated_index = interpolated_index.append(field_idx)

    prices = prices.reindex(interpolated_index)
    prices.index.set_names(["Field", "Date", "Time"], inplace=True)

    # Apply times filter if needed (see Notes in docstring)
    if times and realtime_agg_dbs:
        if not isinstance(times, (list, tuple)):
            times = [times]
        prices = prices.loc[prices.index.get_level_values("Time").isin(times)]

    return prices

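# A hedged usage sketch of get_prices combining a history db and a real-time
# aggregate db. The db codes ("usa-stk-1min", "usa-stk-tick-1min") are
# hypothetical. Per the Notes section above, the real-time db is listed first
# so its values win where the two databases overlap, and `times` is
# interpreted in each security's exchange timezone.
prices = get_prices(
    ["usa-stk-tick-1min", "usa-stk-1min"],   # priority order: real-time first
    start_date="2019-01-01",
    fields=["Close", "Volume"],
    times=["09:30:00", "15:59:00"],
    timezone="America/New_York")

# isolate one intraday time, as in the docstring examples
session_closes = prices.loc["Close"].xs("15:59:00", level="Time")
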
def get_historical_prices(self, start_date, end_date=None, nlv=None,
                          max_cache=None):
    """
    Downloads historical prices from a history db. Downloads security
    details from the master db and broadcasts the values to be shaped
    like the historical prices.
    """
    if start_date:
        start_date = self._get_start_date_with_lookback(start_date)

    dbs = self.DB
    if not isinstance(dbs, (list, tuple)):
        dbs = [self.DB]

    db_universes = set()
    db_bar_sizes = set()
    for db in dbs:
        db_config = get_db_config(db)
        universes = db_config.get("universes", None)
        if universes:
            db_universes.update(set(universes))
        bar_size = db_config.get("bar_size")
        db_bar_sizes.add(bar_size)

    db_universes = list(db_universes)
    db_bar_sizes = list(db_bar_sizes)

    if len(db_bar_sizes) > 1:
        raise MoonshotParameterError(
            "databases must contain same bar size but have different bar sizes "
            "(databases: {0}; bar sizes: {1})".format(
                ", ".join(dbs), ", ".join(db_bar_sizes)))

    all_prices = []

    for db in dbs:

        kwargs = dict(
            start_date=start_date,
            end_date=end_date,
            universes=self.UNIVERSES,
            conids=self.CONIDS,
            exclude_universes=self.EXCLUDE_UNIVERSES,
            exclude_conids=self.EXCLUDE_CONIDS,
            times=self.DB_TIME_FILTERS,
            cont_fut=self.CONT_FUT,
            fields=self.DB_FIELDS,
            tz_naive=False)

        if max_cache:
            prices = HistoryCache.load(db, kwargs, max_cache)

            if prices is not None:
                all_prices.append(prices)
                continue

        if max_cache:
            f = HistoryCache.get_filepath(db, kwargs)
        else:
            f = io.StringIO()
        download_history_file(db, f, **kwargs)
        prices = pd.read_csv(f)
        all_prices.append(prices)

    prices = pd.concat(all_prices)

    prices = prices.pivot(index="ConId", columns="Date").T
    prices.index.set_names(["Field", "Date"], inplace=True)

    # Next, get the master file
    universes = self.UNIVERSES
    conids = self.CONIDS
    if not conids and not universes:
        universes = db_universes
        if not universes:
            conids = list(prices.columns)

    f = io.StringIO()
    download_master_file(
        f,
        conids=conids,
        universes=universes,
        exclude_conids=self.EXCLUDE_CONIDS,
        exclude_universes=self.EXCLUDE_UNIVERSES,
        fields=self.MASTER_FIELDS)

    securities = pd.read_csv(f, index_col="ConId")

    nlv = nlv or self._get_nlv()
    if nlv:
        missing_nlvs = set(securities.Currency) - set(nlv.keys())
        if missing_nlvs:
            raise ValueError(
                "NLV dict is missing values for required currencies: {0}".format(
                    ", ".join(missing_nlvs)))

        securities['Nlv'] = securities.apply(
            lambda row: nlv.get(row.Currency, None), axis=1)

    # Append securities, indexed to the min date, to allow easy ffill on demand
    securities = pd.DataFrame(securities.T, columns=prices.columns)
    securities.index.name = "Field"
    idx = pd.MultiIndex.from_product(
        (securities.index, [prices.index.get_level_values("Date").min()]),
        names=["Field", "Date"])

    securities = securities.reindex(index=idx, level="Field")
    prices = pd.concat((prices, securities))

    timezone = self.TIMEZONE or self._infer_timezone(prices)

    dates = pd.to_datetime(prices.index.get_level_values("Date"), utc=True)
    dates = dates.tz_convert(timezone)

    prices.index = pd.MultiIndex.from_arrays(
        (prices.index.get_level_values("Field"), dates),
        names=("Field", "Date"))

    # Split date and time
    dts = prices.index.get_level_values("Date")
    dates = pd.to_datetime(dts.date)
    dates.tz = timezone
    prices.index = pd.MultiIndex.from_arrays(
        (prices.index.get_level_values("Field"),
         dates,
         dts.strftime("%H:%M:%S")),
        names=["Field", "Date", "Time"])

    if db_bar_sizes[0] in ("1 day", "1 week", "1 month"):
        prices.index = prices.index.droplevel("Time")

    return prices

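# A hedged sketch of the strategy attributes this method reads. The class
# name, db code, and attribute values are hypothetical and only the
# attributes referenced above are shown; this is not the original strategy
# definition.
from moonshot import Moonshot

class DemoStrategy(Moonshot):
    DB = "demo-stk-1d"                        # history db code (assumed to exist)
    DB_FIELDS = ["Open", "Close", "Volume"]
    MASTER_FIELDS = ["Timezone", "Currency"]  # Currency is needed when nlv is passed
    TIMEZONE = "America/New_York"             # set explicitly to skip inference

prices = DemoStrategy().get_historical_prices("2017-01-01", nlv={"USD": 500000})
closes = prices.loc["Close"]
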
def get_reuters_estimates_reindexed_like(reindex_like, codes, fields=["Actual"],
                                         period_types=["Q"], max_lag=None):
    """
    Return a multiindex (Indicator, Field, Date) DataFrame of point-in-time
    Reuters estimates and actuals for one or more indicator codes, reindexed
    to match the index (dates) and columns (conids) of `reindex_like`.
    Estimates and actuals are forward-filled in order to provide the latest
    reading at any given date, indexed to the UpdatedDate field. UpdatedDate
    is shifted forward 1 day to avoid lookahead bias.

    Parameters
    ----------
    reindex_like : DataFrame, required
        a DataFrame (usually of prices) with dates for the index and conids
        for the columns, to which the shape of the resulting DataFrame will
        be conformed

    codes : list of str, required
        the indicator code(s) to query. Use the `list_reuters_codes`
        function to see available codes.

    fields : list of str
        a list of fields to include in the resulting DataFrame. Defaults to
        simply including the Actual field.

    period_types : list of str, optional
        limit to these fiscal period types. Possible choices: A, Q, S, where
        A=Annual, Q=Quarterly, S=Semi-Annual. Default is Q/Quarterly.

    max_lag : str, optional
        maximum amount of time a data point can be used after the associated
        fiscal period has ended. Setting a limit can prevent using data that
        is reported long after the fiscal period ended, or can limit how far
        data is forward filled in the absence of subsequent data. Specify as
        a Pandas offset alias, e.g. '500D'. By default, no maximum limit is
        applied.

    Returns
    -------
    DataFrame
        a multiindex (Indicator, Field, Date) DataFrame of estimates and
        actuals, shaped like the input DataFrame

    Examples
    --------
    Query book value per share (code BVPS):

    >>> closes = prices.loc["Close"]
    >>> estimates = get_reuters_estimates_reindexed_like(closes, codes=["BVPS"])
    >>> book_values_per_share = estimates.loc["BVPS"].loc["Actual"]
    """
    try:
        import pandas as pd
    except ImportError:
        raise ImportError("pandas must be installed to use this function")

    index_levels = reindex_like.index.names
    if "Time" in index_levels:
        raise ParameterError(
            "reindex_like should not have 'Time' in index, please take a cross-section first, "
            "for example: `prices.loc['Close'].xs('15:45:00', level='Time')`")

    if index_levels != ["Date"]:
        raise ParameterError(
            "reindex_like must have index called 'Date', but has {0}".format(
                ",".join([str(name) for name in index_levels])))

    if not hasattr(reindex_like.index, "date"):
        raise ParameterError("reindex_like must have a DatetimeIndex")

    conids = list(reindex_like.columns)
    start_date = reindex_like.index.min().date()
    # Since financial reports are sparse, start well before the reindex_like
    # min date
    start_date -= pd.Timedelta(days=365 + 180)
    start_date = start_date.isoformat()
    end_date = reindex_like.index.max().date().isoformat()

    f = six.StringIO()
    query_fields = list(fields)  # copy fields on Py2 or 3: https://stackoverflow.com/a/2612815/417414
    if "UpdatedDate" not in query_fields:
        query_fields.append("UpdatedDate")
    download_reuters_estimates(codes, f, conids=conids, start_date=start_date,
                               end_date=end_date, fields=query_fields,
                               period_types=period_types)
    parse_dates = ["UpdatedDate", "FiscalPeriodEndDate"]
    if "AnnounceDate" in fields:
        parse_dates.append("AnnounceDate")
    estimates = pd.read_csv(f, parse_dates=parse_dates)

    # Drop records with no actuals
    estimates = estimates.loc[estimates.UpdatedDate.notnull()]

    # Convert UTC UpdatedDate to security timezone, and cast to date for
    # index
    f = six.StringIO()
    download_master_file(f, conids=list(estimates.ConId.unique()),
                         delisted=True, fields=["Timezone"])
    timezones = pd.read_csv(f, index_col="ConId")
    estimates = estimates.join(timezones, on="ConId")
    if estimates.Timezone.isnull().any():
        conids_missing_timezones = list(
            estimates.ConId[estimates.Timezone.isnull()].unique())
        raise MissingData(
            "timezones are missing for some conids so cannot convert UTC "
            "estimates to timezone of security (conids missing timezone: {0})".format(
                ",".join([str(conid) for conid in conids_missing_timezones])))

    # If only 1 timezone in data, use a faster method
    if len(estimates.Timezone.unique()) == 1:
        timezone = list(estimates.Timezone.unique())[0]
        estimates["Date"] = pd.to_datetime(
            estimates.UpdatedDate.values).tz_localize("UTC").tz_convert(timezone)
    else:
        estimates["Date"] = estimates.apply(
            lambda row: row.UpdatedDate.tz_localize("UTC").tz_convert(row.Timezone),
            axis=1)

    # Convert to dates (i.e. time = 00:00:00)
    estimates.loc[:, "Date"] = pd.to_datetime(
        estimates.Date.apply(
            lambda x: pd.datetime.combine(x, pd.Timestamp("00:00:00").time())))

    # Drop any fields we don't need
    needed_fields = set(fields)
    needed_fields.update(set(("ConId", "Date", "Indicator")))
    if max_lag:
        needed_fields.add("FiscalPeriodEndDate")
    unneeded_fields = set(estimates.columns) - needed_fields
    if unneeded_fields:
        estimates = estimates.drop(unneeded_fields, axis=1)

    # if reindex_like.index is tz-aware, make financials tz-aware so they can
    # be joined (tz-aware or tz-naive are both fine, as SourceDate represents
    # dates which are assumed to be in the local timezone of the reported
    # company)
    if reindex_like.index.tz:
        estimates.loc[:, "Date"] = estimates.Date.dt.tz_localize(
            reindex_like.index.tz.zone)
        deduped_updated_dates = estimates.Date.drop_duplicates()
    else:
        # joining to tz-naive requires using values, whereas joining to
        # tz-aware requires not using it. Why?
        deduped_updated_dates = estimates.Date.drop_duplicates().values

    # Create a unioned index of input DataFrame and UpdatedDate
    union_date_idx = reindex_like.index.union(deduped_updated_dates).sort_values()

    all_estimates = {}
    for code in codes:
        estimates_for_code = estimates.loc[estimates.Indicator == code]
        if estimates_for_code.empty:
            continue
        if "Indicator" not in fields:
            estimates_for_code = estimates_for_code.drop("Indicator", axis=1)
        # There might be duplicate UpdatedDates if a company announced
        # reports for several fiscal periods at once. In this case we keep
        # only the last value (i.e. latest fiscal period)
        estimates_for_code = estimates_for_code.drop_duplicates(
            subset=["ConId", "Date"], keep="last")
        estimates_for_code = estimates_for_code.pivot(index="ConId", columns="Date").T
        multiidx = pd.MultiIndex.from_product(
            (estimates_for_code.index.get_level_values(0).unique(), union_date_idx),
            names=["Field", "Date"])
        estimates_for_code = estimates_for_code.reindex(
            index=multiidx, columns=reindex_like.columns)

        # estimates are sparse so ffill (one field at a time)
        all_fields_for_code = {}
        for field in estimates_for_code.index.get_level_values("Field").unique():
            field_for_code = estimates_for_code.loc[field].fillna(method="ffill")

            # Shift to avoid lookahead bias
            field_for_code = field_for_code.shift()

            all_fields_for_code[field] = field_for_code

        # Filter stale values if asked to
        if max_lag:
            fiscal_period_end_dates = all_fields_for_code["FiscalPeriodEndDate"]
            # subtract the max_lag from the index date to get the
            # earliest possible fiscal period end date for that row
            earliest_allowed_fiscal_period_end_dates = fiscal_period_end_dates.apply(
                lambda x: fiscal_period_end_dates.index - pd.Timedelta(max_lag))
            within_max_timedelta = fiscal_period_end_dates.apply(
                pd.to_datetime) >= earliest_allowed_fiscal_period_end_dates

            for field, field_for_code in all_fields_for_code.items():
                field_for_code = field_for_code.where(within_max_timedelta)
                all_fields_for_code[field] = field_for_code

            # Clean up if FiscalPeriodEndDate was just kept around for this purpose
            if "FiscalPeriodEndDate" not in fields:
                del all_fields_for_code["FiscalPeriodEndDate"]

        estimates_for_code = pd.concat(all_fields_for_code, names=["Field", "Date"])

        # In cases where the statements included dates not in the input
        # DataFrame, drop those now that we've ffilled
        extra_dates = union_date_idx.difference(reindex_like.index)
        if not extra_dates.empty:
            estimates_for_code.drop(extra_dates, axis=0, level="Date", inplace=True)

        all_estimates[code] = estimates_for_code

    estimates = pd.concat(all_estimates, names=["Indicator", "Field", "Date"])

    return estimates

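# A hedged usage sketch showing the max_lag parameter with the BVPS code from
# the docstring example. `prices` is assumed to be a daily prices DataFrame
# loaded earlier; with max_lag="180D", an actual stops being forward-filled
# 180 days after its fiscal period ends.
closes = prices.loc["Close"]
estimates = get_reuters_estimates_reindexed_like(
    closes, codes=["BVPS"], fields=["Actual"],
    period_types=["Q"], max_lag="180D")
book_values_per_share = estimates.loc["BVPS"].loc["Actual"]
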
def _get_stockloan_data_reindexed_like(stockloan_func, stockloan_field,
                                       reindex_like, time=None):
    """
    Common base function for get_shortable_shares_reindexed_like and
    get_borrow_fees_reindexed_like.
    """
    try:
        import pandas as pd
    except ImportError:
        raise ImportError("pandas must be installed to use this function")

    index_levels = reindex_like.index.names
    if "Time" in index_levels:
        raise ParameterError(
            "reindex_like should not have 'Time' in index, please take a cross-section first, "
            "for example: `prices.loc['Close'].xs('15:45:00', level='Time')`")

    if index_levels != ["Date"]:
        raise ParameterError(
            "reindex_like must have index called 'Date', but has {0}".format(
                ",".join([str(name) for name in index_levels])))

    if not hasattr(reindex_like.index, "date"):
        raise ParameterError("reindex_like must have a DatetimeIndex")

    conids = list(reindex_like.columns)
    start_date = reindex_like.index.min().date()
    # Stockloan data is sparse but batched in monthly files, so start >1-month
    # before the reindex_like min date
    start_date -= pd.Timedelta(days=45)
    start_date = start_date.isoformat()
    end_date = reindex_like.index.max().date().isoformat()

    f = six.StringIO()
    stockloan_func(f, conids=conids, start_date=start_date, end_date=end_date)
    stockloan_data = pd.read_csv(f)
    stockloan_data.loc[:, "Date"] = pd.to_datetime(stockloan_data.Date, utc=True)

    # Determine timezone, from:
    # - time param if provided
    # - else reindex_like.index.tz if set
    # - else component securities if all have same timezone
    timezone = None

    if time and " " in time:
        time, timezone = time.split(" ", 1)

        if reindex_like.index.tz and reindex_like.index.tz.zone != timezone:
            raise ParameterError((
                "cannot use timezone {0} because reindex_like timezone is {1}, "
                "these must match".format(timezone, reindex_like.index.tz.zone)))

    if not timezone:
        if reindex_like.index.tz:
            timezone = reindex_like.index.tz.zone
        else:
            # try to infer from component securities
            f = six.StringIO()
            download_master_file(f, conids=list(stockloan_data.ConId.unique()),
                                 delisted=True, fields=["Timezone"])
            security_timezones = pd.read_csv(f, index_col="ConId")
            security_timezones = list(security_timezones.Timezone.unique())
            if len(security_timezones) > 1:
                raise ParameterError(
                    "no timezone specified and cannot infer because multiple timezones are "
                    "present in data, please specify timezone (timezones in data: {0})".format(
                        ", ".join(security_timezones)))
            timezone = security_timezones[0]

    # Create an index of `reindex_like` dates at `time`
    if time:
        try:
            time = pd.Timestamp(time).time()
        except ValueError as e:
            raise ParameterError("could not parse time '{0}': {1}".format(
                time, str(e)))
        index_at_time = pd.Index(reindex_like.index.to_series().apply(
            lambda x: pd.datetime.combine(x, time)))
    else:
        index_at_time = reindex_like.index

    if not index_at_time.tz:
        index_at_time = index_at_time.tz_localize(timezone)

    index_at_time = index_at_time.tz_convert("UTC")

    stockloan_data = stockloan_data.pivot(index="ConId", columns="Date").T
    stockloan_data = stockloan_data.loc[stockloan_field]

    # Create a unioned index of requested times and stockloan data timestamps
    unioned_idx = index_at_time.union(stockloan_data.index)
    # performance boost: don't need to reindex and ffill earlier than the
    # stockloan data start date
    unioned_idx = unioned_idx[unioned_idx >= stockloan_data.index.min()]
    stockloan_data = stockloan_data.reindex(index=unioned_idx,
                                            columns=reindex_like.columns)

    stockloan_data = stockloan_data.fillna(method="ffill")

    # Keep only the requested times, now that we've ffilled
    stockloan_data = stockloan_data.reindex(index=index_at_time)

    # Replace index_at_time with the original reindex_like index (this needs
    # to be done because index_at_time is tz-aware and reindex_like may not
    # be, and because index_at_time uses the requested `time` if provided.
    # Since index_at_time was derived from reindex_like.index and has the
    # same shape, it is safe to replace it.)
    stockloan_data.index = reindex_like.index

    return stockloan_data

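# A hedged sketch of how the public wrappers named in the docstring might
# delegate to this base function. The download functions exist in
# quantrocket.fundamental, but the field names ("Quantity", "FeeRate") and
# the exact wiring shown here are assumptions, not the library's actual code.
from quantrocket.fundamental import (
    download_shortable_shares,
    download_borrow_fees,
)

def get_shortable_shares_reindexed_like(reindex_like, time=None):
    # shortable share quantities, forward-filled and shaped like reindex_like
    return _get_stockloan_data_reindexed_like(
        download_shortable_shares, "Quantity", reindex_like, time=time)

def get_borrow_fees_reindexed_like(reindex_like, time=None):
    # borrow fee rates, forward-filled and shaped like reindex_like
    return _get_stockloan_data_reindexed_like(
        download_borrow_fees, "FeeRate", reindex_like, time=time)
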
def get_historical_prices(codes, start_date=None, end_date=None,
                          universes=None, conids=None,
                          exclude_universes=None, exclude_conids=None,
                          times=None, cont_fut=None, fields=None,
                          master_fields=None, timezone=None,
                          infer_timezone=None):
    """
    Query one or more history databases and load prices into a DataFrame.

    For bar sizes smaller than 1-day, the resulting DataFrame will have a
    MultiIndex with levels (Field, Date, Time). For bar sizes of 1-day or
    larger, the MultiIndex will have levels (Field, Date).

    Parameters
    ----------
    codes : str or list of str, required
        the code(s) of one or more databases to query. If multiple databases
        are specified, they must have the same bar size.
    start_date : str (YYYY-MM-DD), optional
        limit to history on or after this date
    end_date : str (YYYY-MM-DD), optional
        limit to history on or before this date
    universes : list of str, optional
        limit to these universes (default is to return all securities in database)
    conids : list of int, optional
        limit to these conids
    exclude_universes : list of str, optional
        exclude these universes
    exclude_conids : list of int, optional
        exclude these conids
    times: list of str (HH:MM:SS), optional
        limit to these times
    cont_fut : str
        stitch futures into continuous contracts using this method (default is
        not to stitch together). Possible choices: concat
    fields : list of str, optional
        only return these fields (pass ['?'] or any invalid fieldname to see
        available fields)
    master_fields : list of str, optional
        append these fields from the securities master database (pass ['?'] or
        any invalid fieldname to see available fields)
    timezone : str, optional
        convert timestamps to this timezone, for example America/New_York
        (see `pytz.all_timezones` for choices); ignored for non-intraday bar sizes
    infer_timezone : bool
        infer the timezone from the securities master Timezone field; defaults
        to True if using intraday bars and no `timezone` specified; ignored for
        non-intraday bars, or if `timezone` is passed

    Returns
    -------
    DataFrame
        a MultiIndex DataFrame

    Examples
    --------
    Load intraday prices:

    >>> prices = get_historical_prices('stk-sample-5min', fields=["Close", "Volume"], timezone="America/New_York")
    >>> prices.head()
    ConId                         265598  38708077
    Field   Date        Time
    Close   2017-07-26  09:30:00  153.62    2715.0
                        09:35:00  153.46    2730.0
                        09:40:00  153.21    2725.0
                        09:45:00  153.28    2725.0
                        09:50:00  153.18    2725.0

    Isolate the closes:

    >>> closes = prices.loc["Close"]
    >>> closes.head()
    ConId                 265598  38708077
    Date        Time
    2017-07-26  09:30:00  153.62    2715.0
                09:35:00  153.46    2730.0
                09:40:00  153.21    2725.0
                09:45:00  153.28    2725.0
                09:50:00  153.18    2725.0

    Isolate the 15:45:00 prices:

    >>> session_closes = closes.xs("15:45:00", level="Time")
    >>> session_closes.head()
    ConId         265598  38708077
    Date
    2017-07-26    153.29   2700.00
    2017-07-27    150.10   2660.00
    2017-07-28    149.43   2650.02
    2017-07-31    148.99   2650.34
    2017-08-01    149.72   2675.50
    """
    # Import pandas lazily since it can take a moment to import
    try:
        import pandas as pd
    except ImportError:
        raise ImportError("pandas must be installed to use this function")

    try:
        import pytz
    except ImportError:
        raise ImportError("pytz must be installed to use this function")

    if timezone and timezone not in pytz.all_timezones:
        raise ParameterError(
            "invalid timezone: {0} (see `pytz.all_timezones` for choices)".format(
                timezone))

    dbs = codes
    if not isinstance(dbs, (list, tuple)):
        dbs = [dbs]

    if master_fields:
        if isinstance(master_fields, tuple):
            master_fields = list(master_fields)
        elif not isinstance(master_fields, list):
            master_fields = [master_fields]

    db_universes = set()
    db_bar_sizes = set()
    for db in dbs:
        db_config = get_db_config(db)
        _db_universes = db_config.get("universes", None)
        if _db_universes:
            db_universes.update(set(_db_universes))
        bar_size = db_config.get("bar_size")
        db_bar_sizes.add(bar_size)

    db_universes = list(db_universes)
    db_bar_sizes = list(db_bar_sizes)

    if len(db_bar_sizes) > 1:
        raise ParameterError(
            "all databases must contain same bar size but {0} have different "
            "bar sizes: {1}".format(", ".join(dbs), ", ".join(db_bar_sizes)))

    all_prices = []

    for db in dbs:

        kwargs = dict(
            start_date=start_date,
            end_date=end_date,
            universes=universes,
            conids=conids,
            exclude_universes=exclude_universes,
            exclude_conids=exclude_conids,
            times=times,
            cont_fut=cont_fut,
            fields=fields,
            tz_naive=False)

        tmp_filepath = "{0}/history.{1}.{2}.{3}.csv".format(
            TMP_DIR, db, os.getpid(), time.time())

        download_history_file(db, tmp_filepath, **kwargs)

        prices = pd.read_csv(tmp_filepath)
        all_prices.append(prices)

        os.remove(tmp_filepath)

    prices = pd.concat(all_prices)

    prices = prices.pivot(index="ConId", columns="Date").T
    prices.index.set_names(["Field", "Date"], inplace=True)

    is_intraday = db_bar_sizes[0] not in ("1 day", "1 week", "1 month")

    if is_intraday and not timezone and infer_timezone is not False:
        infer_timezone = True
        if not master_fields:
            master_fields = []
        if "Timezone" not in master_fields:
            master_fields.append("Timezone")

    # Next, get the master file
    if master_fields:

        universes = universes
        conids = conids
        if not conids and not universes:
            universes = db_universes
            if not universes:
                conids = list(prices.columns)

        f = six.StringIO()
        download_master_file(f, conids=conids, universes=universes,
                             exclude_conids=exclude_conids,
                             exclude_universes=exclude_universes,
                             fields=master_fields, delisted=True)
        securities = pd.read_csv(f, index_col="ConId")

        if "Delisted" in securities.columns:
            securities.loc[:, "Delisted"] = securities.Delisted.astype(bool)

        if "Etf" in securities.columns:
            securities.loc[:, "Etf"] = securities.Etf.astype(bool)

        # Append securities, indexed to the min date, to allow easy ffill on demand
        securities = pd.DataFrame(securities.T, columns=prices.columns)
        securities.index.name = "Field"
        idx = pd.MultiIndex.from_product(
            (securities.index, [prices.index.get_level_values("Date").min()]),
            names=["Field", "Date"])

        securities = securities.reindex(index=idx, level="Field")
        prices = pd.concat((prices, securities))

    if is_intraday:
        dates = pd.to_datetime(prices.index.get_level_values("Date"), utc=True)

        if not timezone and infer_timezone:
            timezone = _infer_timezone(prices)

        if timezone:
            dates = dates.tz_convert(timezone)
    else:
        dates = pd.to_datetime(prices.index.get_level_values("Date"))

    prices.index = pd.MultiIndex.from_arrays(
        (prices.index.get_level_values("Field"), dates),
        names=("Field", "Date"))

    # Split date and time
    dts = prices.index.get_level_values("Date")
    dates = pd.to_datetime(dts.date).tz_localize(None)  # drop tz-aware in Date index
    prices.index = pd.MultiIndex.from_arrays(
        (prices.index.get_level_values("Field"),
         dates,
         dts.strftime("%H:%M:%S")),
        names=["Field", "Date", "Time"])

    # Align dates if there are any duplicates. Explanation: Suppose there are
    # two timezones represented in the data. After parsing these dates into a
    # common timezone, they will align properly, but we pivoted before
    # parsing the dates (for performance reasons), so they may not be
    # aligned. Thus we need to dedupe the index.
    prices = prices.groupby(prices.index).first()
    prices.index = pd.MultiIndex.from_tuples(prices.index)
    prices.index.set_names(["Field", "Date", "Time"], inplace=True)

    # Drop time if not intraday
    if not is_intraday:
        prices.index = prices.index.droplevel("Time")
        return prices

    # If intraday, fill missing times so that each date has the same set of
    # times, allowing easier comparisons. Example implications:
    # - if history is retrieved intraday, this ensures that today will have NaN
    #   entries for future times
    # - early close dates will have a full set of times, with NaNs after the
    #   early close
    unique_fields = prices.index.get_level_values("Field").unique()
    unique_dates = prices.index.get_level_values("Date").unique()
    unique_times = prices.index.get_level_values("Time").unique()
    interpolated_index = None
    for field in unique_fields:
        if master_fields and field in master_fields:
            min_date = prices.loc[field].index.min()
            field_idx = pd.MultiIndex.from_tuples([(field, min_date[0], min_date[1])])
        else:
            field_idx = pd.MultiIndex.from_product([[field], unique_dates, unique_times])
        if interpolated_index is None:
            interpolated_index = field_idx
        else:
            interpolated_index = interpolated_index.append(field_idx)

    prices = prices.reindex(interpolated_index)
    prices.index.set_names(["Field", "Date", "Time"], inplace=True)

    return prices

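# A hedged usage sketch of master_fields with this function: the appended
# securities master fields are indexed to the earliest date, so they can be
# forward-filled on demand to line up with the price rows. The db code
# "demo-stk-1d" is hypothetical.
prices = get_historical_prices(
    "demo-stk-1d", start_date="2017-01-01",
    fields=["Close"],
    master_fields=["Timezone", "PrimaryExchange"])

closes = prices.loc["Close"]
exchanges = prices.loc["PrimaryExchange"].reindex(closes.index).fillna(method="ffill")
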
def _get_historical_prices(self, start_date, end_date, nlv=None):
    """
    Downloads historical prices from a history db. Downloads security
    details from the master db and broadcasts the values to be shaped
    like the historical prices.
    """
    if start_date:
        start_date = self._get_start_date_with_lookback(start_date)

    f = io.StringIO()
    download_history_file(
        self.DB, f,
        start_date=start_date,
        end_date=end_date,
        universes=self.UNIVERSES,
        conids=self.CONIDS,
        exclude_universes=self.EXCLUDE_UNIVERSES,
        exclude_conids=self.EXCLUDE_CONIDS,
        times=self.DB_TIME_FILTERS,
        cont_fut=self.CONT_FUT,
        fields=self.DB_FIELDS,
        tz_naive=False)

    prices = pd.read_csv(f, parse_dates=["Date"])
    prices = prices.pivot(index="ConId", columns="Date").T
    prices.index.set_names(["Field", "Date"], inplace=True)

    # Next, get the master file
    universes = self.UNIVERSES
    conids = self.CONIDS
    if not conids and not universes:
        universes = get_db_config(self.DB).get("universes", None)
        if not universes:
            conids = list(prices.columns)

    f = io.StringIO()
    download_master_file(
        f,
        conids=conids,
        universes=universes,
        exclude_conids=self.EXCLUDE_CONIDS,
        exclude_universes=self.EXCLUDE_UNIVERSES,
        fields=self.MASTER_FIELDS)

    securities = pd.read_csv(f, index_col="ConId")

    nlv = nlv or self._get_nlv()
    if nlv:
        missing_nlvs = set(securities.Currency) - set(nlv.keys())
        if missing_nlvs:
            raise ValueError(
                "NLV dict is missing values for required currencies: {0}".format(
                    ", ".join(missing_nlvs)))

        securities['Nlv'] = securities.apply(
            lambda row: nlv.get(row.Currency, None), axis=1)

    securities = pd.DataFrame(securities.T, columns=prices.columns)
    securities.index.name = "Field"
    idx = pd.MultiIndex.from_product(
        (securities.index, prices.index.get_level_values("Date").unique()),
        names=["Field", "Date"])

    broadcast_securities = securities.reindex(index=idx, level="Field")

    prices = pd.concat((prices, broadcast_securities))

    return prices

def _write_assets(self, prices, asset_db_writer):
    """
    Queries the master service and prepares the securities for the asset
    writer.
    """
    f = io.StringIO()
    download_master_file(
        f,
        universes=self.universes,
        delisted=True,
        fields=[
            "ConId", "PrimaryExchange", "Symbol", "SecType", "LocalSymbol",
            "LongName", "MinTick", "Multiplier", "LastTradeDate",
            "ContractMonth", "Timezone", "UnderConId"
        ])

    assets = pd.read_csv(f, index_col="ConId")
    assets[["Symbol", "LocalSymbol"]] = assets[["Symbol", "LocalSymbol"]].astype(str)

    grouped_by_conid = prices.reset_index().groupby("ConId")
    max_dates = grouped_by_conid.Date.max()
    min_dates = grouped_by_conid.Date.min()
    min_dates.name = "start_date"
    max_dates.name = "end_date"
    assets = assets.join(min_dates).join(max_dates)
    assets["first_traded"] = assets["start_date"]

    exchanges = pd.DataFrame(
        assets, columns=["PrimaryExchange", "Timezone"]).drop_duplicates()
    exchanges = exchanges.rename(columns={
        "PrimaryExchange": "exchange",
        "Timezone": "timezone"
    })

    equities = assets[assets.SecType == "STK"].copy()
    if equities.empty:
        equities = None
    else:
        equities = pd.DataFrame(equities, columns=[
            "PrimaryExchange", "Symbol", "LongName",
            "start_date", "end_date", "first_traded"
        ])
        equities = equities.rename(columns={
            "PrimaryExchange": "exchange",
            "Symbol": "symbol",
            "LongName": "asset_name"
        })

    futures = assets[assets.SecType == "FUT"].copy()
    if futures.empty:
        futures = None
        root_symbols = None
    else:
        # Concat local symbol plus contract month to ensure unique symbol
        futures["symbol"] = futures.LocalSymbol.str.cat(
            futures.ContractMonth.astype(str), "-")
        futures = futures.rename(columns={
            "PrimaryExchange": "exchange",
            "Symbol": "root_symbol",
            "LongName": "asset_name",
            "Multiplier": "multiplier",
            "MinTick": "tick_size",
            "LastTradeDate": "auto_close_date",
            "UnderConId": "root_symbol_id"
        })
        futures["expiration_date"] = futures.auto_close_date
        root_symbols = pd.DataFrame(
            futures,
            columns=["root_symbol", "exchange", "root_symbol_id"]).drop_duplicates()
        futures = futures.drop(
            ["root_symbol_id", "LocalSymbol", "ContractMonth", "SecType"], axis=1)

    asset_db_writer.write(
        equities=equities,
        futures=futures,
        exchanges=exchanges,
        root_symbols=root_symbols)