# Write the landing page for the factor zoo report (only when regenerating anew).
if 'new' in testable:
    with open(os.path.join(outdir, 'index.html'), 'wt') as f:
        f.write('<h1>Quant Factors Zoo</h1><br>')
        f.write(' <br>')
        f.write('<p>\n')

# Momentum and divyld from CRSP monthly
if 'monthly' in testable:
    if regenerate:
        beg, end = 19251231, LAST_DATE
        # Momentum lookback windows as (skip, lookback) in months:
        # e.g. mom12m is the return from month -12 through month -2,
        # skipping the most recent month; mom1m is the one-month reversal.
        intervals = {'mom12m': (2, 12),
                     'mom36m': (13, 36),
                     'mom6m': (2, 6),
                     'mom1m': (1, 1)}
        for label, past in intervals.items():
            out = []        # collect one frame of signal values per rebalance
            for rebaldate in bd.date_range(bd.endmo(beg, past[1]), end,
                                           'endmo'):
                start = bd.endmo(rebaldate, -past[1])   # require return here
                beg1 = bd.offset(start, 1)              # signal window start
                end1 = bd.endmo(rebaldate, 1 - past[0])  # signal window end
                df = crsp.get_universe(end1)
                # 'start' only flags that a monthly return exists at the
                # beginning of the lookback window; rows lacking it are dropped
                df['start'] = crsp.get_section(dataset='monthly',
                                               fields=['ret'],
                                               date_field='date',
                                               date=start)\
                                  .reindex(df.index)
                df[label] = crsp.get_ret(beg1, end1).reindex(df.index)
                df['permno'] = df.index
                df['rebaldate'] = rebaldate
                df = df.dropna(subset=['start'])
                out.append(df[['rebaldate', 'permno', label]])
            # DataFrame.append was removed in pandas 2.0: accumulate frames
            # in a list and concat once (also avoids quadratic re-copying)
            out = pd.concat(out, ignore_index=True)
            n = signals.write(out, label, overwrite=True)
ax.set_title(f"Top Bigrams: {ticker} {year} 10-K Business Description") #plt.savefig(os.path.join(imgdir, f"{ticker}{year}.jpg")) plt.show() # Community Detection with Business Descriptions # Load spacy vocab lang = 'en_core_web_lg' nlp = spacy.load(lang) n_vocab, vocab_dim = nlp.vocab.vectors.shape print('Language:', lang, ' vocab:', n_vocab, ' dim:', vocab_dim) stopwords = {'company', 'companys', 'companies', 'product', 'products', 'service', 'services', 'business', 'description', 'year', 'years'} # Load stock universes univs = {y: crsp.get_universe(bd.endmo(int(f"{y-1}1231"))).assign(year=y) for y in range(1993, 2021)} ## Extract lemmatized nouns and named entities from bus10K documents A = ed.open(form=form, item=item) # open bus10K archive A['year'] = [d.year-(d.month<4) for d in int2date(A['date'])] # set fiscal year tic = time.time() for year in [2020, 2019, 2018, 2017]: docs = dict() ners = dict() for i, permno in tqdm(enumerate(sorted(univs[year].index))): doc = A[A['permno'].eq(permno) & A['year'].eq(year)].sort_values('date') if len(doc): sent = ed[doc.iloc[0]['pathname']].encode('ascii', 'ignore').lower()
## compute HML portfolio holdings
signals = chunk_signal(df)
holdings = famafrench_sorts(crsp, 'hml', signals, rebalbeg, LAST_DATE,
                            window=12, months=[6],
                            rebals=rebals)['holdings']

# Compute MOM momentum factor
label = 'mom'
past = (2, 12)          # (skip, lookback): months -12 through -2
df = []                 # one frame of momentum signal values per rebalance
rebalend = bd.endmo(LAST_DATE, -1)
for rebaldate in bd.date_range(rebalbeg, rebalend, 'endmo'):
    # signal window boundaries relative to the rebalance date
    lookback = bd.endmo(rebaldate, -past[1])     # price required at this date
    sig_start = bd.offset(lookback, 1)           # first date, inclusive
    sig_end = bd.endmo(rebaldate, 1 - past[0])   # last date of signal window
    # universe, cumulative return over the window, and begin/end prices;
    # the inner join keeps only stocks present in all four pieces
    pieces = [crsp.get_universe(rebaldate),
              crsp.get_ret(sig_start, sig_end)['ret'].rename(label),
              crsp.get_section('monthly', ['prc'], 'date',
                               lookback)['prc'].rename('beg'),
              crsp.get_section('monthly', ['prc'], 'date',
                               sig_end)['prc'].rename('end')]
    merged = pd.concat(pieces, axis=1, join='inner').reset_index().dropna()
    merged['rebaldate'] = rebaldate
    df.append(merged[['permno', 'rebaldate', label]])
    print(rebaldate, len(df), len(merged))
df = pd.concat(df)
        # NOTE(review): continuation of a boolean row-selection that begins
        # before this chunk -- filters factor-adjustment (facpr) events
        & (facpr['close'] - facpr['prc'].abs()).abs().ge(0.009)
        & (facpr['prc'].gt(0)
           | (facpr['close'] - facpr['prc'].abs()).abs().gt(0.25))]
# adjust shrout if necessary, then merge into big df of prices
facpr['shrout'] *= (1 + facpr['facpr'])
facpr = facpr.set_index('permno')
price.loc[facpr.index, 'shrout'] = facpr.loc[facpr.index, 'shrout']
prices = prices.join(price[['shrout']], on='permno', how='inner')

# create monthly data df
prices['month'] = prices['date'] // 100     # yyyymmdd -> yyyymm bucket
groupby = prices.groupby(['permno', 'month'])
# 'ret'/'retx' hold gross returns (1+r) here -- compound by product then
# subtract 1; price is the last close of the month
monthly = (groupby[['ret', 'retx']].prod() - 1)\
    .join(groupby['close'].last()).reset_index()
monthly['month'] = bd.endmo(monthly['month'])   # yyyymm -> month-end date
monthly.rename(columns={'month': 'date', 'close': 'prc'}, inplace=True)
monthly = monthly[monthly['date'] > date]       # keep only new months
print('monthly:', len(monthly), len(np.unique(monthly['permno'])))

# clean up prices table to mimic daily table: back to net returns and
# CRSP daily column names
prices['ret'] -= 1
prices['retx'] -= 1
prices.rename(columns={
    'open': 'openprc',
    'high': 'askhi',
    'low': 'bidlo',
    'close': 'prc',
    'volume': 'vol'
}, inplace=True)
# Robust standard errors for the panel regression: HAC within panel groups,
# then clustered by portfolio, for comparison
print(
    ls.get_robustcov_results('hac-panel',
                             groups=rets['port'],
                             maxlags=3).summary())
print(ls.get_robustcov_results('cluster', groups=rets['port']).summary())

## Fama MacBeth with individual stocks and standardized scores as loadings
rebalbeg = 19640601
rebalend = LAST_DATE
rebaldates = crsp.bd.date_range(rebalbeg, rebalend, 'endmo')
loadings = dict()
for pordate in rebaldates:              # retrieve signal values every month
    date = bd.june_universe(pordate)    # universe fixed at preceding June
    univ = crsp.get_universe(date)
    cap = np.sqrt(crsp.get_cap(date)['cap'])
    smb = -np.log(cap).rename('size')   # negated so larger = smaller cap
    hml = signals('hml', date, bd.endmo(date, -12))['hml'].rename('value')
    # shrink beta toward 1 with weight 2/3 (Blume-style adjustment)
    beta = (signals('beta', pordate, bd.begmo(pordate))['beta']
            * 2 / 3) + (1 / 3)
    mom = signals('mom', pordate)['mom'].rename('momentum')
    df = pd.concat(
        (beta, hml, smb, mom),          # inner join of signals with univ
        join='inner',
        axis=1).reindex(univ.index).dropna()
    # clip extreme scores at the 5th/95th percentiles
    loadings[pordate] = winsorized(df, quantiles=[0.05, 0.95])

## Compute coefficients from FM cross-sectional regressions
riskpremium = RiskPremium(user, bench, 'RF', LAST_DATE)
riskpremium(
    crsp,
    loadings,                           # FM regressions on standardized scores
    weights=None,
wordlists = Unstructured(mongodb, 'WordLists')
# 'lm' word lists -- presumably Loughran-McDonald sentiment lexicon; verify
# against the WordLists collection
sentiments = {k: wordlists['lm', k] for k in ['positive', 'negative']}

# Pre-process with sklearn methods
tf_vectorizer = sklearn.feature_extraction.text.CountVectorizer(
    strip_accents='unicode',
    lowercase=True,
    #stop_words=stop_words,
    # tokenizer=CustomTokenizer(),
    # keep only tokens of 3+ word characters containing no digits
    token_pattern=r"\b[^\d\W][^\d\W][^\d\W]+\b")
tokenize = tf_vectorizer.build_tokenizer()
analyze = tf_vectorizer.build_analyzer()

# Construct sentiment feature all years for usual universe:
# universe for year yr+1 is taken as of Dec 31 of year yr
univs = {
    yr + 1: crsp.get_universe(bd.endmo(yr * 10000 + 1231)).assign(year=yr + 1)
    for yr in range(1992, 2020)
}
results = []
files = ed.open(form=form, item=item)   # open mda10K archive
tic = time.time()
permnos = files['permno'].unique().astype(int)
for i, permno in tqdm(enumerate(permnos)):   # Loop over all permnos
    # retrieve all valid mda's for this permno by year
    mdas = {}
    dates = {}
    for _, f in files[files['permno'].eq(permno)].iterrows():
        year = int(f['date']) // 10000          # yyyymmdd -> yyyy
        if ((f['date'] // 100) % 100) <= 3:     # if filing date <= Mar
            year = year - 1                     # then assign to previous year
plot_ff(y, label, corr=np.corrcoef(backtest.excess, rowvar=False)[0, 1], num=2, logdir=logdir) ## Construct Mom # Load monthly universe and stock returns from CRSP. # Signal is stocks' total return from 12 months ago, skipping most recent month # Construct 2-way portfolio sorts, and backtest returns label, benchname, past, leverage = 'mom', 'Mom(mo)', (2, 12), 1 rebalbeg, rebalend = 19260101, LAST_DATE df = [] # collect each month's momentum signal values for rebaldate in bd.date_range(rebalbeg, rebalend, 'endmo'): beg = bd.endmo(rebaldate, -past[1]) # require price at this date start = bd.offset(beg, 1) # start date, inclusive, of signal end = bd.endmo(rebaldate, 1 - past[0]) # end date of signal p = [ crsp.get_universe(rebaldate), # retrieve prices and construct signal crsp.get_ret(start, end)['ret'].rename(label), crsp.get_section('monthly', ['prc'], 'date', beg)['prc'].rename('beg'), crsp.get_section('monthly', ['prc'], 'date', end)['prc'].rename('end') ] q = pd.concat(p, axis=1, join='inner').reset_index().dropna() q['rebaldate'] = rebaldate df.append(q[['permno', 'rebaldate', label]]) print(rebaldate, len(df), len(q)) df = pd.concat(df) signals.write(df, label, overwrite=True) portfolios = famafrench_sorts(crsp,