shareclass = []
daily = []
bins = {k: {} for k in ['effective', 'realized', 'impact', 'quoted', 'volume',
                        'offersize', 'bidsize', 'ret', 'retq', 'counts']}
tic = time.time()
intervals = [(v, 's') for v in [1, 2, 5, 15, 30]] + [(v, 'm') for v in [1, 2, 5]]
dates = [20191007, 20191008, 20180305, 20180306]
for d, date in enumerate(dates):
    master, trades, quotes = opentaq(date, taqdir)

    # screen on CRSP universe, and drop duplicate share classes (same permco)
    univ = crsp.get_universe(date)\
               .join(crsp.get_section(dataset='names',
                                      fields=['ncusip', 'permco', 'exchcd'],
                                      date_field='date',
                                      date=date,
                                      start=0),
                     how='inner')\
               .sort_values(['permco', 'ncusip'])
    dups = master['CUSIP'].str.slice(0, 8).isin(
        univ.loc[univ.duplicated(['permco'], keep=False), 'ncusip'])
    shareclass.extend(master[dups].to_dict(orient='index').values())
    univ = univ.sort_values(['permco', 'cap'], na_position='first')\
               .drop_duplicates(['permco'], keep='last')\
               .reset_index().set_index('ncusip', drop=False)

    # Iterate by symbol over Daily TAQ trades, NBBO and master files
    for ct, cq, m in itertaq(trades, quotes, master, cusips=univ['ncusip'],
                             open_t=_open, close_t=None):
        h = {'date': date}
        h.update(univ.loc[m['CUSIP'][:8],
                          ['permno', 'decile', 'exchcd', 'siccd']])
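
# Hedged aside (not from the original source): standard microstructure formulas
# that the 'effective', 'realized', 'impact', and 'quoted' bins above presumably
# accumulate. Toy numbers only; the actual per-symbol trade and NBBO series
# come from itertaq().
import numpy as np

price, bid, ask = 100.02, 99.99, 100.03        # trade price and prevailing NBBO
mid = (bid + ask) / 2                          # quote midpoint at trade time
mid_future = 100.00                            # midpoint a short interval later
sign = 1.0 if price >= mid else -1.0           # crude trade-direction proxy

quoted = ask - bid                             # quoted spread
effective = 2 * sign * (price - mid)           # effective spread (twice signed distance from midpoint)
realized = 2 * sign * (price - mid_future)     # realized spread after the interval
impact = 2 * sign * (mid_future - mid)         # price impact = effective - realized
assert np.isclose(effective, realized + impact)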
node_color.update({k: 'cyan' for k in top_color})
pos = igraph_draw(g,
                  num=1,
                  center_name=center_name,
                  node_color=node_color,
                  node_size=node_size,
                  edge_color='r',
                  k=2,
                  pos=None,
                  font_size=10,
                  figsize=(11, 12),
                  labels={k: v for k, v in zip(g.vs['name'], g.vs['bea'])},
                  title=f"Production Flows {list(total['year'].unique())}")
plt.show()

# Construct monthly BEA industry returns for the same period of years
codes = Sectoring(sql, f"bea{vintage}", fillna='')
naics = pstat.build_lookup('lpermno', 'naics', fillna=0)
caps, counts, rets = [], [], []
for year in years:
    date = bd.endyr(year - 1)
    univ = crsp.get_universe(date)
    univ['bea'] = codes[naics(univ.index, date)]
    univ = univ[univ['bea'].ne('')]
    grouped = univ.groupby('bea')
    caps.append(grouped['cap'].sum().rename(year))
    counts.append(grouped['cap'].count().rename(year))
    months = bd.date_range(date, bd.endyr(year), 'endmo')
    for rebaldate, end in zip(months[:-1], months[1:]):
        r = pd.concat([crsp.get_ret(bd.begmo(end), end),
                       crsp.get_cap(rebaldate, use_permco=False),
                       univ['bea']],
                      axis=1, join='inner').dropna()
        grp = r.groupby('bea')
        # industry return is the sum of cap-weighted stock returns
        r['wtdret'] = r['ret'].mul(r['cap'].div(grp['cap'].transform('sum')))
        rets.append(grp['wtdret'].sum(min_count=1).rename(end))
        print(end, len(r), r['wtdret'].sum() / len(grp))
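
# Hedged aside (illustration only): the cap-weighted industry return computed
# above, shown on a toy frame.  Each stock's return is weighted by its share of
# its group's total market cap, then summed within the group.
import pandas as pd

toy = pd.DataFrame({'bea': ['11', '11', '21'],
                    'ret': [0.02, -0.01, 0.05],
                    'cap': [300., 100., 50.]})
g = toy.groupby('bea')
toy['wtdret'] = toy['ret'] * toy['cap'] / g['cap'].transform('sum')
print(g['wtdret'].sum())    # industry '11': 0.75*0.02 + 0.25*(-0.01) = 0.0125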
# Construct weekly reversal
rebalbeg = 19730629    # stock coverage in CRSP increased from around this date
rebalend = 20210101    # a Friday, so the last week of 2020 can be included
wd = Weekly(sql, 'Fri', rebalbeg, rebalend)   # generate Friday-ended weekly calendar

# Retrieve weekly returns, standardize scores, and compute returns and i.c.
june_universe = 0   # to track when a June-end is reached, to update the universe
year = 0            # to track a new year, to retrieve prices in batch for screening
res = DataFrame()
tic = time.time()
for rebaldate in wd.date_range(rebalbeg, rebalend)[:-1]:
    d = bd.june_universe(rebaldate)
    if d != june_universe:
        june_universe = d                          # update universe every June
        univ = crsp.get_universe(june_universe)    # usual CRSP universe screen
        univ = univ[univ['decile'] < 10]           # drop smallest-decile stocks
    start = wd.begwk(rebaldate)       # starting date of rebalance week
    beg = bd.offset(rebaldate, 1)     # beginning date of holding week
    end = wd.endwk(beg)               # ending date of holding week
    prcdate = bd.offset(start, -1)    # require price available at start of week
    prcyear = (prcdate // 10000) * 10000
    if prcyear != year:               # retrieve a new batch of prices each new year
        year = prcyear
        prc = crsp.get_range('daily', 'prc', 'date', year + 101, year + 1231,
                             use_cache=True)
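
# Hedged aside (toy data, hypothetical variable names): the weekly reversal
# signal is the negative of the prior week's return, standardized
# cross-sectionally; its information coefficient (i.c.) can be measured as the
# Spearman rank correlation with the following week's return.
import pandas as pd

past_ret = pd.Series({10001: 0.04, 10002: -0.02, 10003: 0.01})  # prior-week returns
next_ret = pd.Series({10001: -0.01, 10002: 0.02, 10003: 0.00})  # holding-week returns
score = -past_ret                              # reversal: bet against last week's move
score = (score - score.mean()) / score.std()   # standardize cross-sectionally
ic = score.corr(next_ret, method='spearman')   # information coefficient
print(score.round(2).to_dict(), 'i.c.:', round(ic, 2))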
ax.set_title(f"Top Bigrams: {ticker} {year} 10-K Business Description") #plt.savefig(os.path.join(imgdir, f"{ticker}{year}.jpg")) plt.show() # Community Detection with Business Descriptions # Load spacy vocab lang = 'en_core_web_lg' nlp = spacy.load(lang) n_vocab, vocab_dim = nlp.vocab.vectors.shape print('Language:', lang, ' vocab:', n_vocab, ' dim:', vocab_dim) stopwords = {'company', 'companys', 'companies', 'product', 'products', 'service', 'services', 'business', 'description', 'year', 'years'} # Load stock universes univs = {y: crsp.get_universe(bd.endmo(int(f"{y-1}1231"))).assign(year=y) for y in range(1993, 2021)} ## Extract lemmatized nouns and named entities from bus10K documents A = ed.open(form=form, item=item) # open bus10K archive A['year'] = [d.year-(d.month<4) for d in int2date(A['date'])] # set fiscal year tic = time.time() for year in [2020, 2019, 2018, 2017]: docs = dict() ners = dict() for i, permno in tqdm(enumerate(sorted(univs[year].index))): doc = A[A['permno'].eq(permno) & A['year'].eq(year)].sort_values('date') if len(doc): sent = ed[doc.iloc[0]['pathname']].encode('ascii', 'ignore').lower()
f.write(' <br>')
f.write('<p>\n')

# Momentum and divyld from CRSP monthly
if 'monthly' in testable:
    if regenerate:
        beg, end = 19251231, LAST_DATE
        intervals = {'mom12m': (2, 12),
                     'mom36m': (13, 36),
                     'mom6m': (2, 6),
                     'mom1m': (1, 1)}
        for label, past in intervals.items():
            out = DataFrame()
            for rebaldate in bd.date_range(bd.endmo(beg, past[1]), end, 'endmo'):
                start = bd.endmo(rebaldate, -past[1])
                beg1 = bd.offset(start, 1)
                end1 = bd.endmo(rebaldate, 1 - past[0])
                df = crsp.get_universe(end1)
                df['start'] = crsp.get_section(dataset='monthly',
                                               fields=['ret'],
                                               date_field='date',
                                               date=start)\
                                  .reindex(df.index)
                df[label] = crsp.get_ret(beg1, end1).reindex(df.index)
                df['permno'] = df.index
                df['rebaldate'] = rebaldate
                df = df.dropna(subset=['start'])
                out = pd.concat([out, df[['rebaldate', 'permno', label]]],
                                ignore_index=True)    # append rows
            n = signals.write(out, label, overwrite=True)

        beg, end = 19270101, LAST_DATE
        columns = ['chmom', 'divyld', 'indmom']
        out = DataFrame()
        for rebaldate in bd.date_range(beg, end, 'endmo'):
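
# Hedged aside (toy arithmetic, separate from the loop above): the (2, 12)
# convention means the mom12m signal at a rebalance month is the return
# compounded over months t-12 through t-2, skipping the most recent month (t-1).
monthly_rets = [0.01, -0.02, 0.03, 0.00, 0.01, 0.02,
                -0.01, 0.00, 0.01, 0.02, 0.04]    # the 11 months t-12 .. t-2
mom12m = 1.0
for r in monthly_rets:
    mom12m *= (1 + r)
mom12m -= 1
print(round(mom12m, 4))    # cumulative return over the 11-month window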
                                    LAST_DATE, window=12, months=[6],
                                    rebals=rebals)['holdings']

# Compute MOM momentum factor
label = 'mom'
past = (2, 12)
df = []    # collect each month's momentum signal values
rebalend = bd.endmo(LAST_DATE, -1)
for rebaldate in bd.date_range(rebalbeg, rebalend, 'endmo'):
    beg = bd.endmo(rebaldate, -past[1])       # require price at this date
    start = bd.offset(beg, 1)                 # start date, inclusive, of signal
    end = bd.endmo(rebaldate, 1 - past[0])    # end date of signal

    # retrieve prices and construct signal
    p = [crsp.get_universe(rebaldate),
         crsp.get_ret(start, end)['ret'].rename(label),
         crsp.get_section('monthly', ['prc'], 'date', beg)['prc'].rename('beg'),
         crsp.get_section('monthly', ['prc'], 'date', end)['prc'].rename('end')]
    q = pd.concat(p, axis=1, join='inner').reset_index().dropna()
    q['rebaldate'] = rebaldate
    df.append(q[['permno', 'rebaldate', label]])
    print(rebaldate, len(df), len(q))
df = pd.concat(df)
signals = chunk_signal(df)
holdings[label] = famafrench_sorts(crsp, label, signals, rebalbeg, rebalend,
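
# Hedged aside (generic stand-in, not the library's famafrench_sorts): a simple
# high-minus-low spread portfolio formed from a momentum signal with plain
# pandas, to illustrate what the sorts produce; column names are hypothetical.
import pandas as pd

toy = pd.DataFrame({'mom': [0.30, 0.10, -0.05, 0.22, -0.15, 0.02],
                    'cap': [5., 3., 2., 4., 1., 2.],
                    'fwd_ret': [0.02, 0.01, -0.01, 0.03, -0.02, 0.00]})
toy['tercile'] = pd.qcut(toy['mom'], 3, labels=['lo', 'mid', 'hi'])
vw = toy.groupby('tercile', observed=True)\
        .apply(lambda g: (g['fwd_ret'] * g['cap']).sum() / g['cap'].sum())
print(vw['hi'] - vw['lo'])    # value-weighted high-minus-low spread return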
wordlists = Unstructured(mongodb, 'WordLists')
sentiments = {k: wordlists['lm', k] for k in ['positive', 'negative']}

# Pre-process with sklearn methods
tf_vectorizer = sklearn.feature_extraction.text.CountVectorizer(
    strip_accents='unicode',
    lowercase=True,
    #stop_words=stop_words,
    #tokenizer=CustomTokenizer(),
    token_pattern=r"\b[^\d\W][^\d\W][^\d\W]+\b")
tokenize = tf_vectorizer.build_tokenizer()
analyze = tf_vectorizer.build_analyzer()

# Construct sentiment features for all years over the usual universe
univs = {yr + 1: crsp.get_universe(bd.endmo(yr * 10000 + 1231)).assign(year=yr + 1)
         for yr in range(1992, 2020)}
results = []
files = ed.open(form=form, item=item)    # open mda10K archive
tic = time.time()
permnos = files['permno'].unique().astype(int)
for i, permno in tqdm(enumerate(permnos)):    # loop over all permnos
    # retrieve all valid MD&A filings for this permno, by year
    mdas = {}
    dates = {}
    for _, f in files[files['permno'].eq(permno)].iterrows():
        year = int(f['date']) // 10000
        if ((f['date'] // 100) % 100) <= 3:   # if filing date is on or before March
            year = year - 1                   # then assign to the previous year
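
# Hedged aside (illustration only): a common way to turn the Loughran-McDonald
# word lists retrieved above into a document-level sentiment score, using the
# tokenizer built from the CountVectorizer; the exact feature used downstream
# may be defined differently.
text = "The company reported strong gains, although litigation losses hurt margins."
tokens = tokenize(text.lower())
pos = sum(t in sentiments['positive'] for t in tokens)
neg = sum(t in sentiments['negative'] for t in tokens)
sentiment = (pos - neg) / max(len(tokens), 1)    # net sentiment per token
print(len(tokens), pos, neg, round(sentiment, 3))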