def test_get_freq_code(self):
    """get_freq_code resolves strings, tuples, and DateOffsets to (code, mult)."""
    # frequency strings
    for alias, base, mult in [('A', 'A', 1), ('3D', 'D', 3), ('-2M', 'M', -2)]:
        assert (frequencies.get_freq_code(alias) ==
                (frequencies.get_freq(base), mult))

    # (name, multiplier) tuples
    for base, mult in [('D', 1), ('A', 3), ('M', -2)]:
        assert (frequencies.get_freq_code((base, mult)) ==
                (frequencies.get_freq(base), mult))

    # an already-numeric tuple passes straight through
    assert frequencies.get_freq_code((1000, 1)) == (1000, 1)

    # DateOffset instances: default, positive, and negative multiples
    for klass, base in [(offsets.Day, 'D'),
                        (offsets.MonthEnd, 'M'),
                        (offsets.Week, 'W')]:
        assert (frequencies.get_freq_code(klass()) ==
                (frequencies.get_freq(base), 1))
        assert (frequencies.get_freq_code(klass(3)) ==
                (frequencies.get_freq(base), 3))
        assert (frequencies.get_freq_code(klass(-2)) ==
                (frequencies.get_freq(base), -2))

    # anchored weekly offsets; Monday is weekday=0
    assert (frequencies.get_freq_code(offsets.Week(weekday=1)) ==
            (frequencies.get_freq('W-TUE'), 1))
    assert (frequencies.get_freq_code(offsets.Week(3, weekday=0)) ==
            (frequencies.get_freq('W-MON'), 3))
    assert (frequencies.get_freq_code(offsets.Week(-2, weekday=4)) ==
            (frequencies.get_freq('W-FRI'), -2))
def test_get_freq_code(self):
    """get_freq_code maps freq strings, tuples, and offsets to (code, mult)."""
    # freqstr and (name, mult) tuple inputs
    cases = [
        ('A', 'A', 1), ('3D', 'D', 3), ('-2M', 'M', -2),
        (('D', 1), 'D', 1), (('A', 3), 'A', 3), (('M', -2), 'M', -2),
    ]
    for arg, base, mult in cases:
        self.assertEqual(frequencies.get_freq_code(arg),
                         (frequencies.get_freq(base), mult))

    # numeric tuple is returned unchanged
    self.assertEqual(frequencies.get_freq_code((1000, 1)), (1000, 1))

    # offsets; monday is weekday=0 for the anchored weekly cases
    offset_cases = [
        (offsets.Day(), 'D', 1), (offsets.Day(3), 'D', 3),
        (offsets.Day(-2), 'D', -2),
        (offsets.MonthEnd(), 'M', 1), (offsets.MonthEnd(3), 'M', 3),
        (offsets.MonthEnd(-2), 'M', -2),
        (offsets.Week(), 'W', 1), (offsets.Week(3), 'W', 3),
        (offsets.Week(-2), 'W', -2),
        (offsets.Week(weekday=1), 'W-TUE', 1),
        (offsets.Week(3, weekday=0), 'W-MON', 3),
        (offsets.Week(-2, weekday=4), 'W-FRI', -2),
    ]
    for off_obj, base, mult in offset_cases:
        self.assertEqual(frequencies.get_freq_code(off_obj),
                         (frequencies.get_freq(base), mult))
def test_is_superperiod_subperiod():
    """Each (coarse, fine) offset pair is a superperiod one way, subperiod the other."""
    pairs = [
        (offsets.YearEnd(), offsets.MonthEnd()),
        (offsets.Hour(), offsets.Minute()),
        (offsets.Second(), offsets.Milli()),
        (offsets.Milli(), offsets.Micro()),
        (offsets.Micro(), offsets.Nano()),
    ]
    for coarse, fine in pairs:
        assert frequencies.is_superperiod(coarse, fine)
        assert frequencies.is_subperiod(fine, coarse)
def project_dates(arg_base, arg_count):
    """Return month-end Timestamps for `arg_count` consecutive months.

    The first element is the end of `arg_base`'s own month; subsequent
    elements are the ends of the following months.
    """
    first_of_month = datetime.date(arg_base.year, arg_base.month, 1)
    dates = []
    for step in range(1, arg_count + 1):
        # MonthEnd(step) rolls the 1st of the month forward `step` month-ends.
        dates.append(pd.to_datetime(first_of_month + offsets.MonthEnd(step)))
    return dates
def update_quarterly_data(self, stockslist=None, date=None, start_date=None,
                          end_date=None):
    """Refresh every quarterly-frequency indicator file via ``update_ori_data``.

    Parameters
    ----------
    stockslist : list, optional
        Securities to update; ``None`` presumably means the full universe
        (forwarded to ``update_ori_data`` -- TODO confirm).
    date : datetime.date, optional
        Reference date; defaults to today.
    start_date, end_date : optional
        Explicit update window, forwarded to ``update_ori_data``.
    """
    # Quarterly indicator names handled by this updater (Wind-style codes).
    inds_to_update = ('assetstoequity', 'cashtocurrentdebt', 'current',
                      'longdebttodebt', 'grossprofitmargin_ttm2',
                      'longdebttoequity', 'qfa_deductedprofit', 'orps',
                      'eps_diluted2', 'qfa_grossprofitmargin',
                      'qfa_netprofitmargin', 'qfa_net_cash_flows_oper_act',
                      'qfa_net_profit_is', 'qfa_oper_rev', 'qfa_roa',
                      'qfa_roe', 'qfa_yoyocf', 'qfa_yoyprofit',
                      'qfa_yoysales', 'roa2_ttm2', 'roe_ttm2',
                      'stm_issuingdate', 'turnover_ttm', 'tot_equity',
                      'tot_liab', 'tot_assets', 'other_equity_instruments_PRE')
    curdate = toffsets.datetime.now().date() if date is None else date
    # Months back to a quarter-end month.  When the current month itself is a
    # quarter end (month % 3 == 0) the offset becomes 3, i.e. the *previous*
    # quarter end is used -- NOTE(review): confirm skipping the just-ended
    # quarter is intentional.
    offset = curdate.month % 3 if (curdate.month % 3 != 0) else (
        curdate.month % 3 + 3)
    ndate = curdate - toffsets.MonthEnd(n=offset)
    for qname in inds_to_update:
        new_cols, new_data = self.update_ori_data(qname, 'q', stockslist,
                                                  date, start_date, end_date)
        if new_cols:
            # Persist only when update_ori_data reports new columns.
            self.close_file(new_data, qname)
            print("\"{}\" data updated to date {}.".format(
                qname, str(ndate)[:10]))
        else:
            print(f"\"{qname}\"'s data don't need to be updated.")
def get_date_ranges(self, start, end, scale='daily', include_bounds=True):
    '''
    Return a list of dates sampled between `start` and `end`.

    Parameters
    ----------
    start: str
        First date that will be included.
    end: str
        Last date that will be included
    scale: {'daily', 'weekly', 'monthly', 'quarterly', 'yearly'}
        Sampling interval between consecutive dates.
    include_bounds: boolean
        Prepend `start` / append `end` when they are not already sampled.
    '''
    valid_scales = ('daily', 'weekly', 'monthly', 'quarterly', 'yearly')
    if scale not in valid_scales:
        raise ValueError('Incorrect scale: %s' % scale)
    start = Timestamp(start)
    end = Timestamp(end)
    if scale == 'daily':
        sampled = pd.date_range(start, end, freq='D')
    else:
        freq_by_scale = dict(weekly='W', monthly='M',
                             quarterly='3M', yearly='12M')
        first_offset = dict(weekly=off.Week(), monthly=off.MonthEnd(),
                            quarterly=off.QuarterEnd(), yearly=off.YearEnd())
        # Anchor the first sample at the first period boundary after `start`.
        sampled = pd.date_range(start + first_offset[scale], end,
                                freq=freq_by_scale[scale])
    dates = list(sampled)
    if include_bounds:
        if start not in dates:
            dates.insert(0, start)
        if end not in dates:
            dates.append(end)
    return dates
def load_shiller():
    """Load market & macroeconomic data from Robert Shiller's website.

    Returns
    -------
    iedata : pd.DataFrame
        Time series of S&P 500 and interest rate variables, indexed by
        month-end dates.

    Example
    -------
    >>> from pyfinance import datasets
    >>> shiller = datasets.load_shiller()
    >>> shiller.iloc[:3, :5]
                sp50p sp50d sp50e      cpi real_rate
    date
    1871-01-31   4.44  0.26   0.4  12.4641    5.3200
    1871-02-28   4.50  0.26   0.4  12.8446    5.3233
    1871-03-31   4.61  0.26   0.4  13.0350    5.3267

    .. _ONLINE DATA ROBERT SHILLER:
        http://www.econ.yale.edu/~shiller/data.htm
    """
    xls = 'http://www.econ.yale.edu/~shiller/data/ie_data.xls'
    cols = ['date', 'sp50p', 'sp50d', 'sp50e', 'cpi', 'frac', 'real_rate',
            'real_sp50p', 'real_sp50d', 'real_sp50e', 'cape']
    # `skipfooter` replaces the removed `skip_footer` alias; it drops the
    # source's trailing footnote row.
    iedata = pd.read_excel(xls, sheet_name='Data', skiprows=7,
                           skipfooter=1, names=cols).drop('frac', axis=1)
    # Dates arrive as e.g. 1871.01; remove the literal dot (regex=False --
    # as a regex, '.' would match *every* character) and append a day so the
    # result parses as %Y%m%d.
    # NOTE(review): October rows may render as 'YYYY.1' after astype(str),
    # colliding with January -- confirm against the source sheet.
    dt = iedata['date'].astype(str).str.replace('.', '', regex=False) + '01'
    iedata['date'] = pd.to_datetime(dt, format="%Y%m%d") + offsets.MonthEnd()
    return iedata.set_index('date')
def get_dates_range(self, scale='auto', start=None, end=None,
                    date_max='2010-01-01'):
    '''
    Returns a list of dates sampled according to the specified parameters.

    :param scale: {'auto', 'maximum', 'daily', 'weekly', 'monthly',
        'quarterly', 'yearly'}
        Scale specifies the sampling intervals.
        'auto' will heuristically choose a scale for quick processing
    :param start: First date that will be included.
    :param end: Last date that will be included

    NOTE(review): under Python 3 the 'maximum' branch and the final filter
    return lazy map/filter objects rather than lists -- confirm callers only
    iterate the result once.
    '''
    if scale not in [
            'auto', 'maximum', 'daily', 'weekly', 'monthly', 'quarterly',
            'yearly'
    ]:
        raise ValueError('Incorrect scale: %s' % scale)
    # Fall back to the earliest recorded start, then to `date_max`.
    start = Timestamp(start or self._start.min() or date_max)
    # FIXME: start != start is true for NaN objects... is NaT the same?
    start = Timestamp(date_max) if repr(start) == 'NaT' else start
    end = Timestamp(end or max(Timestamp(self._end.max()),
                               self._start.max()))
    # FIXME: end != end ?
    end = datetime.utcnow() if repr(end) == 'NaT' else end
    # Clamp both endpoints to the object's configured bounds.
    start = start if self.check_in_bounds(start) else self._lbound
    end = end if self.check_in_bounds(end) else self._rbound
    if scale == 'auto':
        scale = self._auto_select_scale(start, end)
    if scale == 'maximum':
        # Use every recorded start/end timestamp instead of a fixed grid.
        start_dts = list(self._start.dropna().values)
        end_dts = list(self._end.dropna().values)
        dts = map(Timestamp, set(start_dts + end_dts))
        dts = filter(
            lambda ts: self.check_in_bounds(ts) and ts >= start and
            ts <= end, dts)
        return dts
    freq = dict(daily='D', weekly='W', monthly='M', quarterly='3M',
                yearly='12M')
    offset = dict(daily=off.Day(n=0), weekly=off.Week(),
                  monthly=off.MonthEnd(), quarterly=off.QuarterEnd(),
                  yearly=off.YearEnd())
    # for some reason, weekly date range gives one week less:
    end_ = end + off.Week() if scale == 'weekly' else end
    ret = list(pd.date_range(start + offset[scale], end_,
                             freq=freq[scale]))
    ret = [dt for dt in ret if dt <= end]
    # Ensure the exact bounds appear even when the grid misses them.
    ret = [start] + ret if ret and start < ret[0] else ret
    ret = ret + [end] if ret and end > ret[-1] else ret
    ret = filter(lambda ts: self.check_in_bounds(ts), ret)
    return ret
def get_clusters(origin):
    """Load the cluster feature matrix up to the month-end of `origin`.

    Parameters
    ----------
    origin : pandas.Period (or Period-like with ``to_timestamp``)
        Reference period; data is pulled through its month end.

    Returns
    -------
    (None, None) when no cluster rows exist for `origin`'s year, otherwise a
    single DataFrame indexed by (date, ticker) with unpacked feature columns
    and a ('y', 'mtd_1mf') target column.
    NOTE(review): the two branches return different shapes (2-tuple vs.
    DataFrame) -- confirm callers handle both.
    """
    year = int(str(origin)[:4])
    with engine.connect() as conn:
        # Cheap existence probe before pulling the full history.
        sql = f"select 1 from clusters where date_part('year', date)={year}"
        res = conn.execute(sql).fetchone()
        if res is None:
            return None, None
        # Alternative anchor would be BusinessMonthEnd(); calendar month end
        # is used here.
        offset = offsets.MonthEnd()
        end = offset.rollforward(origin.to_timestamp()).strftime('%Y-%m-%d')
        sql = f"""
            select ticker, date, mtd_1mf, vals
            from clusters
            where look_12m=TRUE
            and date between '1995-12-29' and '{end}'
            order by date asc -- if not for sequencing, something to ensure consistent (same) clusters used each time
        """
        df = pd.read_sql(sql, conn, index_col=['date', 'ticker']).sort_index()
        # unpack vals from col of lists to their own cols
        X = [x for x in df.vals.values]
        # drop(columns=...) -- the positional `axis` argument was removed in
        # pandas 2.0.
        df = concat_x_cols(df.drop(columns='vals'), X)
        df['y', 'mtd_1mf'] = df.y.mtd_1mf
    return df
def load_shiller(pickle_from=None, pickle_to=None):
    """Load data from Robert Shiller's website.

    Description: http://www.econ.yale.edu/~shiller/data.htm

    Parameters
    ----------
    pickle_from, pickle_to : optional
        Currently unused placeholders kept for interface compatibility.

    Examples
    ========
    shiller = load_shiller()
    shiller = shiller[shiller.index.month % 3 == 0]
    """
    link = 'http://www.econ.yale.edu/~shiller/data/ie_data.xls'
    # `sheet_name` replaces the removed `sheetname` keyword.
    iedata = (read_excel(link, sheet_name='Data', skiprows=range(0, 7))
              .loc[:, :'CAPE']
              .dropna(subset=['Date'])
              .drop('Fraction', axis=1))
    cols = [
        'date', 'sp50p', 'sp50d', 'sp50e', 'cpi', 'real_rate', 'real_sp50p',
        'real_sp50d', 'real_sp50e', 'cape'
    ]
    iedata.columns = cols
    # Dates arrive as e.g. 1871.01; strip the literal dot (regex=False -- as
    # a regex, '.' would match every character) and append a day so the value
    # parses as %Y%m%d, then roll to month end.
    iedata.loc[:, 'date'] = (pd.to_datetime(
        (iedata.date.astype(str).str.replace('.', '', regex=False) + '01'),
        format="%Y%m%d") + offsets.MonthEnd(1))
    iedata.set_index('date', inplace=True)
    return iedata
def update_monthly_data(self, stockslist=None, date=None, start_date=None, end_date=None): inds_to_update = ('sec_name1', 'industry_citic', 'industry_citic_level2', 'mkt_cap_float', 'pe_ttm', 'val_pe_deducted_ttm', 'ps_ttm', 'pb_lf', 'profit_ttm', 'pcf_ncf_ttm', 'pcf_ocf_ttm', 'dividendyield2', 'or_ttm', 'deductedprofit_ttm', 'ocfps_ttm', 'eps_ttm', 'holder_num', 'holder_avgpct', 'pct_chg_M') # self.update_quarterly_data() # self.qdata_to_mdata((start_date and end_date)) # curdate = toffsets.datetime.now().date() if date is None else date ndate = curdate - toffsets.MonthEnd(n=1) for qname in inds_to_update: new_cols, new_data = self.update_ori_data(qname, 'M', stockslist, date, start_date, end_date) if new_cols: if len(new_cols) == 1: fill_cols = new_data.columns[-2:] new_data.loc[:, fill_cols] = new_data.loc[:, fill_cols].\ fillna(axis=1, method='ffill') self.close_file(new_data, qname) print("\"{}\" data updated to date {}.".format( qname, str(ndate)[:10])) else: print(f"\"{qname}\"'s data don't need to be updated.") #profit_ttm_G profit_ttm_G = self.profit_ttm.T / self.profit_ttm.T.shift(12) - 1 profit_ttm_G = profit_ttm_G.T.dropna(how='all', axis=1) self.close_file(profit_ttm_G, "profit_ttm_G") print("'profit_ttm_G' updated.") #holder_avgpctchg holder_avgpct_cal = 1000 / self.holder_num holder_avgpct_cal, holder_avgpct_get = self._align_element( holder_avgpct_cal, self.holder_avgpct) orival, fillval = holder_avgpct_get.values, holder_avgpct_cal.values newval = np.where(np.isnan(orival), fillval, orival) holder_avgpct_fill = pd.DataFrame(newval, index=holder_avgpct_get.index, columns=holder_avgpct_get.columns) self.close_file(holder_avgpct_fill, "holder_avgpct_fill") h_fill = holder_avgpct_fill.T holder_avgpctchg = h_fill / h_fill.shift(12) - 1 holder_avgpctchg = holder_avgpctchg.T.dropna(how='all', axis=1) self.close_file(holder_avgpctchg, "holder_avgpctchg") print("'holder_avgpct' updated.")
def data_move_test():
    """Print-only demo of Series.shift, DateOffset arithmetic and rolling."""
    s = pd.Series(np.random.randn(6),
                  index=pd.date_range('1/1/2019', periods=6, freq='M'))
    print('原数据 \r\n', s)
    # Plain forward/backward shifts move the *data* against a fixed index,
    # producing missing values at the edges.
    print('数据往后移动 \r\n', s.shift(2))
    print('数据往前移动 \r\n', s.shift(-2))
    # With `freq=`, shift moves the timestamps instead of the data.
    print('后移动 freg参数,根据频率移动,实际对时间戳进行位移而不是对数据进行位移 \r\n', s.shift(2, freq='M'))
    print('前移动 freg参数\r\n', s.shift(-2, freq='D'))
    now = datetime.today()
    print('datetim 今天:\r\n', now)
    print('datetim 偏移 3天\r\n', now + 3 * offset.Day())
    print('datetim 偏移 到本月底\r\n', now + offset.MonthEnd())
    print('datetim期偏移 第2月后的月底\r\n', now + offset.MonthEnd(2))
    print('rollforward 向前滚到当月底 \r\n', offset.MonthEnd().rollforward(now))
    # NOTE(review): the label below says 'rollforward' but the call is
    # rollback (previous month end) -- the Chinese text matches rollback.
    print('rollforward 向后滚到上月底\r\n', offset.MonthEnd().rollback(now))
    print('Series的时间戳 向前滚到月底\r\n',
          s.groupby(offset.MonthEnd().rollforward).count())
def test_is_superperiod_subperiod():
    """None inputs are rejected; (coarse, fine) pairs are symmetric."""
    # input validation: any None argument yields False, never an error
    for p1, p2 in [(offsets.YearEnd(), None),
                   (None, offsets.YearEnd()),
                   (None, None)]:
        assert not frequencies.is_superperiod(p1, p2)
    for p1, p2 in [(offsets.MonthEnd(), None),
                   (None, offsets.MonthEnd()),
                   (None, None)]:
        assert not frequencies.is_subperiod(p1, p2)

    # each (coarse, fine) pair: superperiod one way, subperiod the other
    pairs = [
        (offsets.YearEnd(), offsets.MonthEnd()),
        (offsets.Hour(), offsets.Minute()),
        (offsets.Second(), offsets.Milli()),
        (offsets.Milli(), offsets.Micro()),
        (offsets.Micro(), offsets.Nano()),
    ]
    for coarse, fine in pairs:
        assert frequencies.is_superperiod(coarse, fine)
        assert frequencies.is_subperiod(fine, coarse)
def regress_by_store(df):
    """For each month-end in 2016-01..2017-05, fit trailing-quarter and
    trailing-year regressions and predict the following month.

    Parameters
    ----------
    df : pd.DataFrame
        Source observations; sliced by ``take_df_by_valid_period`` /
        ``take_df_by_period`` (defined elsewhere -- schema assumed by those
        helpers, TODO confirm).

    Returns
    -------
    list of pd.DataFrame
        One frame per month with 'quarter_regress_no_dow' and
        'year_regress_no_dow' prediction columns, indexed like the
        next month's rows.  Months with no data are skipped.
    """
    ret_list = []
    month_ends = pd.date_range(start='01/01/2016', end='05/01/2017', freq='M')
    for month_end in month_ends:
        # Trailing quarter window ending at this month-end.
        quarter_start = month_end - offsets.MonthBegin(3)
        quarter_df = take_df_by_valid_period(df, quarter_start, month_end)
        if quarter_df.empty:
            continue
        # Prediction target: the next calendar month.
        next_month_start = month_end + offsets.MonthBegin(1)
        next_month_end = month_end + offsets.MonthEnd(1)
        next_month_df = take_df_by_period(df, next_month_start,
                                          next_month_end)
        if next_month_df.empty:
            continue
        quarter_y_pred = do_regression(quarter_df, next_month_df)
        # Trailing year window for the second model.
        year_start = month_end - offsets.MonthBegin(12)
        year_df = take_df_by_valid_period(df, year_start, month_end)
        year_y_pred = do_regression(year_df, next_month_df)
        temp_df = pd.DataFrame(index=next_month_df.index)
        temp_df["quarter_regress_no_dow"] = quarter_y_pred
        temp_df["year_regress_no_dow"] = year_y_pred
        ret_list.append(temp_df)
    return ret_list
def load_rf( freq='M', pickle_from=None, pickle_to=None, ): """Build a risk-free rate return series using 3-month US T-bill yields. The 3-Month Treasury Bill: Secondary Market Rate from the Federal Reserve (a yield) is convert to a total return. See 'Methodology' for details. The time series should closely mimic returns of the BofA Merrill Lynch US Treasury Bill (3M) (Local Total Return) index. Parameters ========== reload : bool, default False If False, use pickled data. If True, reload from source freq : str, sequence, or set If a single-character string, return a single-column DataFrame with index frequency corresponding to `freq`. If a sequence or set, return a dict of DataFrames with the keys corresponding to `freq`(s) Methodology =========== The Federal Reserve publishes a daily chart of Selected Interest Rates (release H.15; www.federalreserve.gov/releases/h15/). As with a yield curve, some yields are interpolated from recent issues because Treasury auctions do not occur daily. While the de-annualized ex-ante yield itself is a fairly good tracker of the day's total return, it is not perfect and can exhibit non-neglible error in periods of volatile short rates. The purpose of this function is to convert yields to total returns for 3-month T-bills. It is a straightforward process given that these are discount (zero-coupon) securities. It consists of buying a 3-month bond at the beginning of each month, then amortizing that bond throughout the month to back into the price of a <3-month tenor bond. The source data (pulled from fred.stlouisfed.org) is quoted on a discount basis. (See footnote 4 from release H.15.) This is converted to a bond-equivlanet yield (BEY) and then translated to a hypothetical daily total return. The process largely follows Morningstar's published Return Calculation of U.S. 
Treasury Constant Maturity Indices, and is as follows: - At the beginning of each month a bill is purchased at the prior month-end price, and daily returns in the month reflect the change in daily valuation of this bill - If t is not a business day, its yield is the yield of the prior business day. - At each day during the month, the price of a 3-month bill purchased on the final calendar day of the previous month is computed. - Month-end pricing is unique. At each month-end date, there are effectively two bonds and two prices. The first is the bond hypothetically purchased on the final day of the prior month with 2m remaining to maturity, and the second is a new-issue bond purchased that day with 3m to maturity. The former is used as the numerator to compute that day's total return, while the latter is used as the denominator to compute the next day's (1st day of next month) total return. Description of the BofA Merrill Lynch US 3-Month Treasury Bill Index: The BofA Merrill Lynch US 3-Month Treasury Bill Index is comprised of a single issue purchased at the beginning of the month and held for a full month. At the end of the month that issue is sold and rolled into a newly selected issue. The issue selected at each month-end rebalancing is the outstanding Treasury Bill that matures closest to, but not beyond, three months from the rebalancing date. To qualify for selection, an issue must have settled on or before the month-end rebalancing date. (Source: Bank of America Merrill Lynch) See also ======== FRED: 3-Month Treasury Bill: Secondary Market Rate (DTB3) https://fred.stlouisfed.org/series/DTB3 McGraw-Hill/Irwin, Interest Rates, 2008. https://people.ucsc.edu/~lbaum/econ80h/LS-Chap009.pdf Morningstar, Return Calculation of U.S. Treasury Constant Maturity Indices, September 2008. 
""" # Validate `freq` param freqs = list('DWMQA') freq = freq.upper() if freq.islower() else freq if freq not in freqs: raise ValueError('`freq` must be either a single element or subset' ' from %s, case-insensitive' % freqs) # Load daily 3-Month Treasury Bill: Secondary Market Rate # Note that this is on discount basis and will be converted to BEY # Periodicity is daily rates = dr('DTB3', 'fred', DSTART) * 0.01 rates = (rates.asfreq('D', method='ffill').fillna(method='ffill').squeeze()) # Algebra doesn't 'work' on DateOffsets, don't simplify here! trigger = rates.index.is_month_end dtm_old = rates.index + offsets.MonthEnd(-1) + offsets.MonthEnd(3) \ - rates.index dtm_new = rates.index.where(trigger, rates.index + offsets.MonthEnd(-1)) \ + offsets.MonthEnd(3) - rates.index # This does 2 things in one step: # (1) convert discount yield to BEY # (2) get the price at that BEY and days to maturity # The two equations are simplified # See https://people.ucsc.edu/~lbaum/econ80h/LS-Chap009.pdf p_old = (100 / 360) * (360 - rates * dtm_old.days) p_new = (100 / 360) * (360 - rates * dtm_new.days) res = p_old.pct_change().where(trigger, p_new.pct_change()) res = returns.prep(res, in_format='dec', name='RF', freq='D') if freq != 'D': res = returns.prep(dr.rollup(out_freq=freq), in_format='dec', freq=freq) return res
def test_pickle_freq(self): # GH#2891 prng = period_range("1/1/2011", "1/1/2012", freq="M") new_prng = tm.round_trip_pickle(prng) assert new_prng.freq == offsets.MonthEnd() assert new_prng.freqstr == "M"
lst = fc_table.columns.tolist() # ## 年月表記の Timestamp を月末の日付に # In[ ]: import pandas.tseries.offsets as offsets # In[ ]: pd.to_datetime(bs_table['決算期'], format='%Y.%m') + offsets.MonthEnd() # ## 特定の行(列)の削除 # In[ ]: # 行 3 と 4 を削除 df.drop([3,4]) # In[ ]: # 列 A を削除
yearfirst=True).date())
# Parse announcement dates that match the DD/MM/DD-style pattern.
qr_table['発表日'] = qr_table.loc[
    qr_table['発表日'].str.match('\d\d/\d\d/\d\d'),
    '発表日'].apply(lambda x: parse(x, yearfirst=True).date())
# Convert both date columns to pandas Timestamps.
qr_table['Q期首'] = pd.to_datetime(qr_table['Q期首'], format='%Y-%m-%d')
qr_table['発表日'] = pd.to_datetime(qr_table['発表日'], format='%Y-%m-%d')
# Add a fiscal-period column by referring to the full-year results periods
# (uses the period series kept under another name before forecast rows
# were dropped).
for start_idx, start in qr_table['Q期首'].iteritems():
    for end in pl_end:
        # Assign the first fiscal period whose month end is after this
        # quarter's start date.
        if start < pd.to_datetime(
                end, format='%Y.%m') + offsets.MonthEnd():
            qr_table.loc[start_idx, '決算期'] = end
            break
# Replace non-numeric strings ('-' etc.) in the numeric columns with NaN.
num_col = ('Q売上高', 'Q営業益', 'Q経常益', 'Q最終益', 'Q1株益', 'Q売上営業損益率')
for key in num_col:
    if qr_table[key].dtypes == object:
        qr_table.loc[~qr_table[key].str.replace(r'\.|\-', ""
                                                ).str.isnumeric(),
                     key] = np.nan
# Having to use .str twice feels awkward...
# qr_table.loc[qr_table[key].str.contains('-'), key] = np.nan
# ...but the contains('-') form can't handle strings other than '-'.
# Type conversion
# Bulk conversion via a dict comprehension
qr_table = qr_table.astype({
def test_is_superperiod_subperiod():
    """YearEnd/MonthEnd form a superperiod/subperiod pair in both directions."""
    year_end = offsets.YearEnd()
    month_end = offsets.MonthEnd()
    assert fmod.is_superperiod(year_end, month_end)
    assert fmod.is_subperiod(month_end, year_end)
("-2M", (get_freq("M"), -2)), # Tuple. (("D", 1), (get_freq("D"), 1)), (("A", 3), (get_freq("A"), 3)), (("M", -2), (get_freq("M"), -2)), ((5, "T"), (FreqGroup.FR_MIN, 5)), # Numeric Tuple. ((1000, 1), (1000, 1)), # Offsets. (offsets.Day(), (get_freq("D"), 1)), (offsets.Day(3), (get_freq("D"), 3)), (offsets.Day(-2), (get_freq("D"), -2)), (offsets.MonthEnd(), (get_freq("M"), 1)), (offsets.MonthEnd(3), (get_freq("M"), 3)), (offsets.MonthEnd(-2), (get_freq("M"), -2)), (offsets.Week(), (get_freq("W"), 1)), (offsets.Week(3), (get_freq("W"), 3)), (offsets.Week(-2), (get_freq("W"), -2)), (offsets.Hour(), (FreqGroup.FR_HR, 1)), # Monday is weekday=0. (offsets.Week(weekday=1), (get_freq("W-TUE"), 1)), (offsets.Week(3, weekday=0), (get_freq("W-MON"), 3)), (offsets.Week(-2, weekday=4), (get_freq("W-FRI"), -2)), ]) def test_get_freq_code(freq_input, expected): assert get_freq_code(freq_input) == expected
# Frequency string. ("A", (get_freq_code("A")[0], 1)), ("3D", (get_freq_code("D")[0], 3)), ("-2M", (get_freq_code("M")[0], -2)), # Tuple. (("D", 1), (get_freq_code("D")[0], 1)), (("A", 3), (get_freq_code("A")[0], 3)), (("M", -2), (get_freq_code("M")[0], -2)), ((5, "T"), (FreqGroup.FR_MIN, 5)), # Numeric Tuple. ((1000, 1), (1000, 1)), # Offsets. (offsets.Day(), (get_freq_code("D")[0], 1)), (offsets.Day(3), (get_freq_code("D")[0], 3)), (offsets.Day(-2), (get_freq_code("D")[0], -2)), (offsets.MonthEnd(), (get_freq_code("M")[0], 1)), (offsets.MonthEnd(3), (get_freq_code("M")[0], 3)), (offsets.MonthEnd(-2), (get_freq_code("M")[0], -2)), (offsets.Week(), (get_freq_code("W")[0], 1)), (offsets.Week(3), (get_freq_code("W")[0], 3)), (offsets.Week(-2), (get_freq_code("W")[0], -2)), (offsets.Hour(), (FreqGroup.FR_HR, 1)), # Monday is weekday=0. (offsets.Week(weekday=1), (get_freq_code("W-TUE")[0], 1)), (offsets.Week(3, weekday=0), (get_freq_code("W-MON")[0], 3)), (offsets.Week(-2, weekday=4), (get_freq_code("W-FRI")[0], -2)), ], ) def test_get_freq_code(freq_input, expected): assert get_freq_code(freq_input) == expected
def load_factors():
    """Load risk factor returns.

    Returns a DataFrame of monthly factor returns (percent), concatenated
    from the sources below and truncated at the last valid RF observation.

    Factors
    -------
    Symbol      Description                                   Source
    ------      ----------                                    ------
    MKT                                                       French
    SMB         Size (small minus big)                        French
    HML         Value (high minus low)                        French
    RMW         Profitability (robust minus weak)             French
    CMA         Investment (conservative minus aggressive)    French
    UMD         Momentum (up minus down)                      French
    STR         Short-term reversal                           French
    LTR         Long-term reversal                            French
    BETA        Beta                                          French
    ACC         Accruals                                      French
    VAR         Variance                                      French
    IVAR        Residual variance                             French
    EP          Earnings-to-price                             French
    CP          Cash flow-to-price                            French
    DP          Dividend-to-price                             French
    BAB         Betting against beta                          AQR
    QMJ         Quality minus junk                            AQR
    HMLD        Value (high minus low) [modified version]     AQR
    LIQ         Liquidity                                     Pastor
    BDLB        Bond lookback straddle                        Hsieh
    FXLB        Currency lookback straddle                    Hsieh
    CMLB        Commodity lookback straddle                   Hsieh
    IRLB        Interest rate lookback straddle               Hsieh
    STLB        Stock lookback straddle                       Hsieh
    PUT         CBOE S&P 500 PutWrite Index                   CBOE
    BXM         CBOE S&P 500 BuyWrite Index(R)                CBOE
    RXM         CBOE S&P 500 Risk Reversal Index              CBOE

    Source Directory
    ----------------
    French http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/data_library.html  # noqa
    Pastor http://faculty.chicagobooth.edu/lubos.pastor/research/liq_data_1962_2016.txt  # noqa
    AQR    https://www.aqr.com/library/data-sets
    Hsieh  https://faculty.fuqua.duke.edu/~dah7/HFData.htm
    Fed    https://fred.stlouisfed.org/
    CBOE   http://www.cboe.com/products/strategy-benchmark-indexes
    """
    # TODO: factors elegible for addition
    # VIIX, VIIZ, XIV, ZIV, CRP (AQR)
    # http://www.cboe.com/micro/buywrite/monthendpricehistory.xls ends 2016
    # could use:
    # http://www.cboe.com/publish/scheduledtask/mktdata/datahouse/putdailyprice.csv
    # Warning: slow, kludgy data retrieval follows
    # ------------------------------------------------------------------------
    # `tgt` will become a list of DataFrames and eventually concatenated
    tgt = []
    # MKT, SMB, HML, RMW, CMA, RF, UMD, STR, LTR
    facs = [
        "F-F_Research_Data_5_Factors_2x3",
        "F-F_Momentum_Factor",
        "F-F_ST_Reversal_Factor",
        "F-F_LT_Reversal_Factor",
    ]
    for fac in facs:
        tgt.append(pdr.DataReader(fac, "famafrench", DSTART)[0])
    # BETA, ACC, VAR, IVAR require some manipulation to compute returns
    # in the dual-sort method of Fama-French
    for i in ["BETA", "AC", "VAR", "RESVAR"]:
        ser = pdr.DataReader(
            "25_Portfolios_ME_" + i + "_5x5", "famafrench", DSTART
        )[0]
        # Small-cap column average minus large-cap column average.
        ser = ser.iloc[:, [0, 5, 10, 15, 20]].mean(axis=1) - ser.iloc[
            :, [4, 9, 14, 19, 24]
        ].mean(axis=1)
        ser = ser.rename(i)
        tgt.append(ser)
    # E/P, CF/P, D/P (univariate sorts, quintile spreads)
    for i in ["E-P", "CF-P", "D-P"]:
        ser = pdr.DataReader(
            "Portfolios_Formed_on_" + i, "famafrench", DSTART
        )[0]
        ser = ser.loc[:, "Hi 20"] - ser.loc[:, "Lo 20"]
        ser = ser.rename(i)
        tgt.append(ser)
    # French data is PeriodIndexed; align everything to month-end stamps.
    tgt = [df.to_timestamp(how="end") for df in tgt]
    # BAB, QMJ, HMLD
    # TODO: performance is poor here, runtime is eaten up by these 3
    links = {
        "BAB": "http://bit.ly/2hWyaG8",
        "QMJ": "http://bit.ly/2hUBSgF",
        "HMLD": "http://bit.ly/2hdVb7G",
    }
    for key, value in links.items():
        ser = pd.read_excel(value, header=18, index_col=0)["USA"] * 100
        ser = ser.rename(key)
        tgt.append(ser)
    # Lookback straddles
    link = "http://faculty.fuqua.duke.edu/~dah7/DataLibrary/TF-Fac.xls"
    straddles = pd.read_excel(link, header=14, index_col=0)
    # Index arrives as YYYYMM; append a day and roll to month end.
    straddles.index = pd.DatetimeIndex(
        straddles.index.astype(str) + "01"
    ) + offsets.MonthEnd(1)
    straddles = straddles * 100.0
    tgt.append(straddles)
    # LIQ
    link = "http://bit.ly/2pn2oBK"
    liq = pd.read_csv(
        link,
        skiprows=14,
        delim_whitespace=True,
        header=None,
        usecols=[0, 3],
        index_col=0,
        names=["date", "LIQ"],
    )
    liq.index = pd.DatetimeIndex(
        liq.index.astype(str) + "01"
    ) + offsets.MonthEnd(1)
    # -99 is the source's missing-value sentinel.
    liq = liq.replace(-99, np.nan) * 100.0
    tgt.append(liq)
    # USD, HY
    fred = pdr.DataReader(["DTWEXB", "BAMLH0A0HYM2"], "fred", DSTART)
    fred = fred.asfreq("D", method="ffill").fillna(method="ffill").asfreq("M")
    fred.loc[:, "DTWEXB"] = fred["DTWEXB"].pct_change() * 100.0
    fred.loc[:, "BAMLH0A0HYM2"] = fred["BAMLH0A0HYM2"].diff()
    tgt.append(fred)
    # PUT, BXM, RXM (CBOE options strategy indices)
    link1 = "http://www.cboe.com/micro/put/put_86-06.xls"
    link2 = "http://www.cboe.com/publish/scheduledtask/mktdata/datahouse/putdailyprice.csv"  # noqa
    put1 = pd.read_excel(
        link1, index_col=0, skiprows=6, header=None
    ).rename_axis("DATE")
    put2 = pd.read_csv(
        link2, index_col=0, parse_dates=True, skiprows=7, header=None
    ).rename_axis("DATE")
    put = (
        pd.concat((put1, put2))
        .rename(columns={1: "PUT"})
        .iloc[:, 0]
        .asfreq("D", method="ffill")
        .fillna(method="ffill")
        .asfreq("M")
        .pct_change()
        * 100.0
    )
    tgt.append(put)
    link1 = "http://www.cboe.com/publish/scheduledtask/mktdata/datahouse/bxmarchive.csv"  # noqa
    link2 = "http://www.cboe.com/publish/scheduledtask/mktdata/datahouse/bxmcurrent.csv"  # noqa
    bxm1 = pd.read_csv(
        link1, index_col=0, parse_dates=True, skiprows=5, header=None
    ).rename_axis("DATE")
    bxm2 = pd.read_csv(
        link2, index_col=0, parse_dates=True, skiprows=4, header=None
    ).rename_axis("DATE")
    bxm = (
        pd.concat((bxm1, bxm2))
        .rename(columns={1: "BXM"})
        .iloc[:, 0]
        .asfreq("D", method="ffill")
        .fillna(method="ffill")
        .asfreq("M")
        .pct_change()
        * 100.0
    )
    tgt.append(bxm)
    link = "http://www.cboe.com/publish/scheduledtask/mktdata/datahouse/rxm_historical.csv"  # noqa
    rxm = (
        pd.read_csv(
            link, index_col=0, parse_dates=True, skiprows=2, header=None
        )
        .rename(columns={1: "RXM"})
        .rename_axis("DATE")
        .iloc[:, 0]
        .asfreq("D", method="ffill")
        .fillna(method="ffill")
        .asfreq("M")
        .pct_change()
        * 100.0
    )
    tgt.append(rxm)
    # Clean up data retrieved above
    # -----------------------------------------------------------------
    factors = pd.concat(tgt, axis=1).round(2)
    newnames = {
        "Mkt-RF": "MKT",
        "Mom ": "UMD",
        "ST_Rev": "STR",
        "LT_Rev": "LTR",
        "RESVAR": "IVAR",
        "AC": "ACC",
        "PTFSBD": "BDLB",
        "PTFSFX": "FXLB",
        "PTFSCOM": "CMLB",
        "PTFSIR": "IRLB",
        "PTFSSTK": "STLB",
        "DTWEXB": "USD",
        "BAMLH0A0HYM2": "HY",
    }
    factors.rename(columns=newnames, inplace=True)
    # Get last valid RF date; returns will be constrained to this date
    factors = factors[: factors["RF"].last_valid_index()]
    # Subtract RF for long-only factors
    subtract = ["HY", "PUT", "BXM", "RXM"]
    for i in subtract:
        factors.loc[:, i] = factors[i] - factors["RF"]
    return factors
def _get_month_end(self, date): _, days = calendar.monthrange(date.year, date.month) if date.day == days: return date else: return date + toffsets.MonthEnd(n=1)
def make_date(arg_year, arg_month):
    """Return a Timestamp for the last day of the given year/month."""
    first_day = datetime.date(arg_year, arg_month, 1)
    # MonthEnd(0) rolls forward to the month end without crossing into the
    # next month.
    return pd.to_datetime(first_day + offsets.MonthEnd(0))
("W-FRI", 4005),
    ("Min", 8000),
    ("ms", 10000),
    ("US", 11000),
    ("NS", 12000),
],
)
def test_period_str_to_code(obj, expected):
    # Each period alias string resolves to its internal integer code.
    assert _period_str_to_code(obj) == expected


@pytest.mark.parametrize(
    "p1,p2,expected",
    [
        # Input validation.
        (offsets.MonthEnd(), None, False),
        (offsets.YearEnd(), None, False),
        (None, offsets.YearEnd(), False),
        (None, offsets.MonthEnd(), False),
        (None, None, False),
        (offsets.YearEnd(), offsets.MonthEnd(), True),
        (offsets.Hour(), offsets.Minute(), True),
        (offsets.Second(), offsets.Milli(), True),
        (offsets.Milli(), offsets.Micro(), True),
        (offsets.Micro(), offsets.Nano(), True),
    ],
)
def test_super_sub_symmetry(p1, p2, expected):
    # is_superperiod(p1, p2) and is_subperiod(p2, p1) must always agree.
    assert is_superperiod(p1, p2) is expected
    assert is_subperiod(p2, p1) is expected
def load_retaildata(pickle_from=None, pickle_to=None):
    """Monthly retail trade data from census.gov.

    Downloads the advance monthly retail trade series (one fixed-width
    text file per retail category), truncates each at the start of the
    seasonally-adjusted section, and assembles them into a single
    month-end indexed DataFrame.

    Parameters
    ----------
    pickle_from, pickle_to : str, optional
        Currently unused; retained for interface compatibility with the
        other loaders in this module.

    Returns
    -------
    tuple of (DataFrame, DataFrame)
        ``(sales, yoy)``: raw monthly sales levels and their 12-month
        percent change.
    """
    # full = 'https://www.census.gov/retail/mrts/www/mrtssales92-present.xls'
    # indiv = 'https://www.census.gov/retail/marts/www/timeseries.html'
    db = {
        'Auto, other Motor Vehicle':
            'https://www.census.gov/retail/marts/www/adv441x0.txt',
        'Building Material and Garden Equipment and Supplies Dealers':
            'https://www.census.gov/retail/marts/www/adv44400.txt',
        'Clothing and Clothing Accessories Stores':
            'https://www.census.gov/retail/marts/www/adv44800.txt',
        'Dept. Stores (ex. leased depts)':
            'https://www.census.gov/retail/marts/www/adv45210.txt',
        'Electronics and Appliance Stores':
            'https://www.census.gov/retail/marts/www/adv44300.txt',
        'Food Services and Drinking Places':
            'https://www.census.gov/retail/marts/www/adv72200.txt',
        'Food and Beverage Stores':
            'https://www.census.gov/retail/marts/www/adv44500.txt',
        'Furniture and Home Furnishings Stores':
            'https://www.census.gov/retail/marts/www/adv44200.txt',
        'Gasoline Stations':
            'https://www.census.gov/retail/marts/www/adv44700.txt',
        'General Merchandise Stores':
            'https://www.census.gov/retail/marts/www/adv45200.txt',
        'Grocery Stores':
            'https://www.census.gov/retail/marts/www/adv44510.txt',
        'Health and Personal Care Stores':
            'https://www.census.gov/retail/marts/www/adv44600.txt',
        'Miscellaneous Store Retailers':
            'https://www.census.gov/retail/marts/www/adv45300.txt',
        'Motor Vehicle and Parts Dealers':
            'https://www.census.gov/retail/marts/www/adv44100.txt',
        'Nonstore Retailers':
            'https://www.census.gov/retail/marts/www/adv45400.txt',
        'Retail and Food Services, total':
            'https://www.census.gov/retail/marts/www/adv44x72.txt',
        'Retail, total':
            'https://www.census.gov/retail/marts/www/adv44000.txt',
        'Sporting Goods, Hobby, Book, and Music Stores':
            'https://www.census.gov/retail/marts/www/adv45100.txt',
        'Total (excl. Motor Vehicle)':
            'https://www.census.gov/retail/marts/www/adv44y72.txt',
        'Retail (excl. Motor Vehicle and Parts Dealers)':
            'https://www.census.gov/retail/marts/www/adv4400a.txt'
    }
    dct = {}
    for key, value in db.items():
        # FIX: sep was the non-raw string '\s+', which is an invalid
        # escape sequence (SyntaxWarning on modern Python); use a raw
        # string for the regex separator.
        data = read_csv(value, skiprows=5, skip_blank_lines=True,
                        header=None, sep=r'\s+', index_col=0)
        # The files append a seasonally-adjusted table below the raw
        # data; truncate at whichever marker row is present.
        try:
            cut = data.index.get_loc('SEASONAL')
        except KeyError:
            cut = data.index.get_loc('NO')
        data = data.iloc[:cut]
        data = data.apply(lambda col: pd.to_numeric(col, downcast='float'))
        # Stack (year, month) into a single datetime index anchored at
        # each month's end.
        data = data.stack()
        year = data.index.get_level_values(0)
        month = data.index.get_level_values(1)
        idx = (pd.to_datetime({'year': year, 'month': month, 'day': 1})
               + offsets.MonthEnd(1))
        data.index = idx
        data.name = key
        dct[key] = data
    sales = DataFrame(dct)
    sales = sales.reindex(
        pd.date_range(sales.index[0], sales.index[-1], freq='M'))
    # TODO: account for any skipped months; could specify a DateOffset to
    # `freq` param of `pandas.DataFrame.shift`
    yoy = sales.pct_change(periods=12)
    return sales, yoy
def load_factors(pickle_from=None, pickle_to=None):
    """Load risk factor returns.

    Factors
    =======
    Symbol  Description                                  Source
    ------  ----------                                   ------
    MKT     Market excess return (Mkt-RF)                French
    SMB     Size (small minus big)                       French
    HML     Value (high minus low)                       French
    RMW     Profitability (robust minus weak)            French
    CMA     Investment (conservative minus aggressive)   French
    UMD     Momentum (up minus down)                     French
    STR     Short-term reversal                          French
    LTR     Long-term reversal                           French
    BETA    Beta                                         French
    ACC     Accruals                                     French
    VAR     Variance                                     French
    IVAR    Residual variance                            French
    EP      Earnings-to-price                            French
    CP      Cash flow-to-price                           French
    DP      Dividend-to-price                            French
    BAB     Betting against beta                         AQR
    QMJ     Quality minus junk                           AQR
    HMLD    Value (high minus low) [modified version]    AQR
    LIQ     Liquidity                                    Pastor
    BDLB    Bond lookback straddle                       Hsieh
    FXLB    Currency lookback straddle                   Hsieh
    CMLB    Commodity lookback straddle                  Hsieh
    IRLB    Interest rate lookback straddle              Hsieh
    STLB    Stock lookback straddle                      Hsieh
    PUT     CBOE S&P 500 PutWrite Index                  CBOE
    BXM     CBOE S&P 500 BuyWrite Index®                 CBOE
    RXM     CBOE S&P 500 Risk Reversal Index             CBOE

    Source Directory
    ================
    Source  Link
    ------  ----
    French  http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/data_library.html
    Pastor  http://faculty.chicagobooth.edu/lubos.pastor/research/liq_data_1962_2016.txt
    AQR     https://www.aqr.com/library/data-sets
    Hsieh   https://faculty.fuqua.duke.edu/~dah7/HFData.htm
    Fed     https://fred.stlouisfed.org/
    CBOE    http://www.cboe.com/products/strategy-benchmark-indexes
    """
    # TODO: factors eligible for addition
    # VIIX, VIIZ, XIV, ZIV, CRP (AQR)
    # http://www.cboe.com/micro/buywrite/monthendpricehistory.xls ends 2016
    # could use:
    # http://www.cboe.com/publish/scheduledtask/mktdata/datahouse/putdailyprice.csv

    # Warning: slow, kludgy data retrieval follows
    # ------------------------------------------------------------------------

    # `tgt` will become a list of DataFrames and eventually concatenated
    tgt = []

    # MKT, SMB, HML, RMW, CMA, RF, UMD, STR, LTR
    facs = [
        'F-F_Research_Data_5_Factors_2x3',
        'F-F_Momentum_Factor',
        'F-F_ST_Reversal_Factor',
        'F-F_LT_Reversal_Factor'
    ]
    for fac in facs:
        tgt.append(dr(fac, 'famafrench', DSTART)[0])

    # BETA, ACC, VAR, IVAR require some manipulation to compute returns
    # in the dual-sort method of Fama-French.
    # Columns [0, 5, ...] vs [4, 9, ...] pick the two extreme quintile
    # columns of the 5x5 sort within each row quintile -- presumably the
    # low- and high-characteristic legs; confirm against the French files.
    for i in ['BETA', 'AC', 'VAR', 'RESVAR']:
        ser = dr('25_Portfolios_ME_' + i + '_5x5', 'famafrench', DSTART)[0]
        ser = (ser.iloc[:, [0, 5, 10, 15, 20]].mean(axis=1) -
               ser.iloc[:, [4, 9, 14, 19, 24]].mean(axis=1))
        ser = ser.rename(i)
        tgt.append(ser)

    # E/P, CF/P, D/P (univariate sorts, quintile spreads)
    for i in ['E-P', 'CF-P', 'D-P']:
        ser = dr('Portfolios_Formed_on_' + i, 'famafrench', DSTART)[0]
        ser = ser.loc[:, 'Hi 20'] - ser.loc[:, 'Lo 20']
        ser = ser.rename(i)
        tgt.append(ser)
    # French data arrive with a PeriodIndex; convert to month-end stamps.
    tgt = [df.to_timestamp(how='end') for df in tgt]

    # BAB, QMJ, HMLD
    # TODO: performance is poor here, runtime is eaten up by these 3
    links = {
        'BAB': 'http://bit.ly/2hWyaG8',
        'QMJ': 'http://bit.ly/2hUBSgF',
        'HMLD': 'http://bit.ly/2hdVb7G'
    }
    for key, value in links.items():
        # AQR sheets are in decimal form; scale to percent.
        ser = read_excel(value, header=18, index_col=0)['USA'] * 100
        ser = ser.rename(key)
        tgt.append(ser)

    # Lookback straddles
    link = 'http://faculty.fuqua.duke.edu/~dah7/DataLibrary/TF-Fac.xls'
    straddles = read_excel(link, header=14, index_col=0)
    # Index is YYYYMM; append '01' to parse, then shift to month end.
    straddles.index = (pd.DatetimeIndex(straddles.index.astype(str) + '01') +
                       offsets.MonthEnd(1))
    straddles = straddles * 100.
    tgt.append(straddles)

    # LIQ
    link = 'http://bit.ly/2pn2oBK'
    liq = read_csv(link, skiprows=14, delim_whitespace=True, header=None,
                   usecols=[0, 3], index_col=0, names=['date', 'LIQ'])
    liq.index = (pd.DatetimeIndex(liq.index.astype(str) + '01') +
                 offsets.MonthEnd(1))
    # -99 is the source's missing-value sentinel.
    liq = liq.replace(-99, np.nan) * 100.
    tgt.append(liq)

    # USD, HY
    fred = dr(['DTWEXB', 'BAMLH0A0HYM2'], 'fred', DSTART)  # daily default
    # Forward-fill to daily, then down-sample to month end.
    fred = (fred.asfreq('D', method='ffill').fillna(method='ffill').asfreq('M'))
    fred.loc[:, 'DTWEXB'] = fred['DTWEXB'].pct_change() * 100.
    # HY is an option-adjusted spread, so take first differences, not returns.
    fred.loc[:, 'BAMLH0A0HYM2'] = fred['BAMLH0A0HYM2'].diff()
    tgt.append(fred)

    # PUT, BXM, RXM (CBOE options strategy indices)
    # Each series is a price level: splice the archive and current files,
    # resample to month end, then convert to percent returns.
    link1 = 'http://www.cboe.com/micro/put/put_86-06.xls'
    link2 = 'http://www.cboe.com/publish/scheduledtask/mktdata/datahouse/putdailyprice.csv'
    put1 = (read_excel(link1, index_col=0, skiprows=6,
                       header=None).rename_axis('DATE'))
    put2 = read_csv(link2, index_col=0, parse_dates=True, skiprows=7,
                    header=None).rename_axis('DATE')
    put = (pd.concat((put1, put2)).rename(columns={
        1: 'PUT'
    }).iloc[:, 0].asfreq(
        'D', method='ffill').fillna(method='ffill').asfreq('M').pct_change()
        * 100.)
    tgt.append(put)

    link1 = 'http://www.cboe.com/publish/scheduledtask/mktdata/datahouse/bxmarchive.csv'
    link2 = 'http://www.cboe.com/publish/scheduledtask/mktdata/datahouse/bxmcurrent.csv'
    bxm1 = read_csv(link1, index_col=0, parse_dates=True, skiprows=5,
                    header=None).rename_axis('DATE')
    bxm2 = read_csv(link2, index_col=0, parse_dates=True, skiprows=4,
                    header=None).rename_axis('DATE')
    bxm = (pd.concat((bxm1, bxm2)).rename(columns={
        1: 'BXM'
    }).iloc[:, 0].asfreq(
        'D', method='ffill').fillna(method='ffill').asfreq('M').pct_change()
        * 100.)
    tgt.append(bxm)

    link = 'http://www.cboe.com/publish/scheduledtask/mktdata/datahouse/rxm_historical.csv'
    rxm = (read_csv(
        link, index_col=0, parse_dates=True, skiprows=2,
        header=None).rename(columns={
            1: 'RXM'
        }).rename_axis('DATE').iloc[:, 0].asfreq('D', method='ffill').fillna(
            method='ffill').asfreq('M').pct_change() * 100.)
    tgt.append(rxm)

    # Clean up data retrieved above
    # ------------------------------------------------------------------------
    factors = pd.concat(tgt, axis=1).round(2)
    # Map source-specific column labels to the symbols documented above.
    # NOTE: 'Mom ' (trailing space) is the literal French column name.
    newnames = {
        'Mkt-RF': 'MKT',
        'Mom ': 'UMD',
        'ST_Rev': 'STR',
        'LT_Rev': 'LTR',
        'RESVAR': 'IVAR',
        'AC': 'ACC',
        'PTFSBD': 'BDLB',
        'PTFSFX': 'FXLB',
        'PTFSCOM': 'CMLB',
        'PTFSIR': 'IRLB',
        'PTFSSTK': 'STLB',
        'DTWEXB': 'USD',
        'BAMLH0A0HYM2': 'HY'
    }
    factors.rename(columns=newnames, inplace=True)

    # Get last valid RF date; returns will be constrained to this date
    factors = factors[:factors['RF'].last_valid_index()]

    # Subtract RF for long-only factors
    subtract = ['HY', 'PUT', 'BXM', 'RXM']
    for i in subtract:
        factors.loc[:, i] = factors[i] - factors['RF']
    return factors
import pandas as pd
import pandas.tseries.offsets as offsets

# NOTE(review): this looks like a scratch/debug script -- the first
# exit(0) below makes everything after it unreachable.
week_ends = pd.date_range(start='01/02/2017', end='05/01/2017', freq='W')
print(week_ends)
print(week_ends[16] + offsets.Week(1))
exit(0)

# --- unreachable from here on (kept as-is) ---
month_ends = pd.date_range(start='01/01/2016', end='05/01/2017', freq='M')
print(month_ends[15].replace(day=22))
print(month_ends[15])
if month_ends[15].month == 4 and month_ends[15].year == 2017:
    # NOTE(review): Timestamps are immutable and this result is
    # discarded, so the statement below has no effect.
    month_ends[15].replace(day=22)
    print(month_ends[15])
print(month_ends)
exit(0)
for month_end in month_ends:
    # Derived anchor dates relative to each month end.
    quarter_start = month_end - offsets.MonthBegin(3)
    next_month_start = month_end + offsets.MonthBegin(1)
    next_month_end = month_end + offsets.MonthEnd(1)
    year_start = month_end - offsets.MonthBegin(12)
    print("-"*30)
    print(quarter_start)
    print(next_month_start)
    print(next_month_end)
    print(year_start)