def parse_items(years=years):
    """Extract and save 10-K item sections from EDGAR filings.

    For each year, opens the EDGAR index, maps each filing's CIK to a CRSP
    permno, extracts the requested item sections ('bus10K', 'mda10K'),
    writes each extract to its own file, and logs size statistics.

    Args:
        years: iterable of filing years to process (defaults to the
            module-level ``years`` captured at definition time).

    Returns:
        DataFrame of per-extract log records: year, permno, item, and
        character/word counts of the full filing text and the extract.
        (Previously this frame was built but discarded.)
    """
    ed = EdgarClone(config.datapath['10X'], zipped=False)
    sql = SQL(**config.credentials['sql'])
    bday = BusDay(sql)
    pstat = PSTAT(sql, bday)
    # map CIK (source) to CRSP permno (target); fillna=0 => 0 when unmatched
    to_permno = pstat.build_lookup(target='lpermno', source='cik', fillna=0)
    items = {'10-K': ['bus10K', 'mda10K']}  # '10-Q': ['mda10Q']}
    logger = []
    for year in years:  # 2019, 2021): # Start 1998++
        rows = ed.open(date=year)       # index of filings for this year
        # (removed dead `row = rows.iloc[0]` -- clobbered by the loop below)
        for _, row in rows.iterrows():
            permno = to_permno(int(row['cik']))
            # only forms we extract items for, and only matched permnos
            if row['form'] in items and permno:
                filing = ed[row['pathname']]
                for item in items[row['form']]:
                    extract = Edgar.extract_item(filing, item)
                    s = ed.to_path(form=row['form'], permno=permno, item=item,
                                   basename=os.path.basename(row['pathname']))
                    with open(s, 'wt') as g:
                        g.write(extract)
                    r = {'year': year, 'permno': permno, 'item': item,
                         'text_c': len(filing), 'item_c': len(extract),
                         'text_w': len(filing.split()),
                         'item_w': len(extract.split())}
                    logger.append(r)
                    print(", ".join([f"{k}: {v}" for k, v in r.items()]))
    logger = DataFrame.from_records(logger)
    return logger
import igraph # pip3 install cairocffi import rpy2.robjects as ro from rpy2.robjects.packages import importr from finds.pyR import PyR from finds.busday import BusDay from finds.database import SQL, Redis from finds.structured import CRSP, PSTAT from finds.sectors import Sectoring, BEA from finds.graph import igraph_draw from settings import settings ECHO = True sql = SQL(**settings['sql']) bd = BusDay(sql) rdb = Redis(**settings['redis']) crsp = CRSP(sql, bd, rdb) pstat = PSTAT(sql, bd) bea = BEA(rdb, **settings['bea'], echo=ECHO) logdir = None # os.path.join(settings['images'], 'bea') years = np.arange(1947, 2020) vintages = [1997, 1963, 1947] # when sectoring schemes were revised # Read IOUse tables from BEA website ioUses = dict() for vintage in vintages: for year in [y for y in years if y >= vintage]: df = bea.read_ioUse(year, vintage=vintage) ioUses[(vintage, year)] = df print(f"{len(ioUses)} tables through sectoring vintage year {vintage}") # Smooth average flows over several years, for social relations regression tgt = 'colcode' # head of edge is user industry (table column)
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
import seaborn as sns
import igraph  # pip3 install cairocffi
from igraph import Graph
from itertools import chain
from finds.graph import igraph_info, igraph_community
from finds.database import SQL
from finds.busday import BusDay
from finds.structured import PSTAT
from finds.sectors import Sectoring
from settings import settings

# connections to SQL database and business-day calendar
sql = SQL(**settings['sql'])
bd = BusDay(sql)
pstat = PSTAT(sql, bd)
# os presumably imported earlier in the file -- not visible in this chunk
logdir = os.path.join(settings['images'], 'tnic')  # None
tnic_scheme = 'tnic2'

# Retrieve TNIC scheme from Hoberg and Phillips website
# https://hobergphillips.tuck.dartmouth.edu/industryclass.htm
root = 'https://hobergphillips.tuck.dartmouth.edu/idata/'
# fix: build the URL by concatenation, not os.path.join (which would use a
# backslash separator on Windows); identical result on POSIX
source = root + tnic_scheme + '_data.zip'
if source.startswith('http'):
    response = requests.get(source)
    source = io.BytesIO(response.content)
with zipfile.ZipFile(source).open(tnic_scheme + "_data.txt") as f:
    # fix: raw string for the regex separator (avoids invalid-escape warning)
    tnic_data = pd.read_csv(f, sep=r'\s+')
tnic_data.head()
years = [1989, 1999, 2009, 2019]  # [1999, 2019]:
import matplotlib.pyplot as plt
import os
import time
from finds.database import SQL
from finds.busday import BusDay
from finds.structured import PSTAT, CRSP, Benchmarks
from finds.backtesting import EventStudy
from settings import settings

LAST_DATE = settings['crsp_date']   # last date of CRSP data available
ECHO = True
# connections to SQL databases and business-day calendar
sql = SQL(**settings['sql'], echo=ECHO)
user = SQL(**settings['user'], echo=ECHO)
bd = BusDay(sql)
keydev = PSTAT(sql, bd)             # key developments events source
crsp = CRSP(sql, bd, rdb=None)
bench = Benchmarks(sql, bd)
eventstudy = EventStudy(user, bench, LAST_DATE)
outdir = os.path.join(settings['images'], 'events')

# event window parameters
end = 20201201
beg = 19890101    # 20020101
minobs = 250      # minimum number of event observations required
left, right, post = -1, 1, 21   # event window and post-drift horizon (days)


def eventformat(e, r):
    """Pretty print an event and role description given their id's.

    Args:
        e: event id, key into ``keydev.event_``.
        r: role id, key into ``keydev.role_``.

    Returns:
        str of the form "<event> (<eventid>) <role> [<roleid>]".
    """
    # def instead of a named lambda (PEP 8 E731); same name and output
    return f"{keydev.event_[e]} ({e}) {keydev.role_[r]} [{r}]"


events = sorted(keydev.event_.keys())   # list of eventid's
# One-time loading of sectoring schemes into SQL; guard left disabled (False)
# after the initial run so re-executing the file does not reload them.
if False:
    import os
    from finds.sectors import Sectoring, BEA
    from finds.database import SQL, Redis
    from finds.structured import PSTAT, CRSP
    from finds.busday import BusDay
    from settings import settings
    downloads = settings['remote']
    # connections to SQL database, business-day calendar, and Redis cache
    sql = SQL(**settings['sql'])
    bd = BusDay(sql)
    rdb = Redis(**settings['redis'])  # None
    crsp = CRSP(sql, bd, rdb)
    pstat = PSTAT(sql, bd)
    bea = BEA(rdb, **settings['bea'])
    # 2- and 3-digit SIC groupings (instantiated only; no load() call here)
    scheme = 'sic2'
    codes = Sectoring(sql, scheme)
    scheme = 'sic3'
    codes = Sectoring(sql, scheme)
    # load each Fama-French industry grouping (5- through 49-group schemes)
    for scheme in [5, 10, 12, 17, 30, 38, 48, 49]:
        codes = Sectoring(sql, f"codes{scheme}")
        codes.load(source=None)
    scheme = 'SIC'  # SIC from NAICS Crosswalk
    codes = Sectoring(sql, scheme)
    codes.load(source=None)
# Collect permnos and document vectors for each filing
# NOTE(review): docs, nlp, vector, tic, tfidf, similar, year, sql, bd are
# defined earlier in the file -- not visible in this chunk.
permnos = []
for i, (key, doc) in tqdm(enumerate(iterable(docs, unique=False, key=True))):
    permnos.append(int(key))
    vector[i] = nlp(doc).vector
print(time.time() - tic)

# compute cosine similarity matrixes between documents
similar[year]['cosine'] = cosine_similarity(tfidf, tfidf)
#cos_vector[year] = cosine_similarity(vector, vector)
#corr_tfidf[year] = np.corrcoef(tfidf.toarray(), rowvar=True)
terms = (tfidf > 0).todense()   # binary term-incidence matrix
similar[year]['jaccard'] = 1 - pairwise_distances(terms, metric="jaccard")

## Populate new DataFrame, indexed by permnos, with sic and naics codes
pstat = PSTAT(sql, bd)
vs = DataFrame(index=permnos)
for code in ['sic', 'naics']:
    lookup = pstat.build_lookup('lpermno', code, fillna=0)
    vs[code] = lookup(vs.index)
naics = Sectoring(sql, 'naics', fillna=0)   # supplement from crosswalk
sic = Sectoring(sql, 'sic', fillna=0)
vs['naics'] = vs['naics'].where(vs['naics'] > 0, naics[vs['sic']])
# fix: fill missing sic via the naics->sic crosswalk `sic` (the original used
# naics[vs['naics']], the wrong direction, and left `sic` unused)
vs['sic'] = vs['sic'].where(vs['sic'] > 0, sic[vs['naics']])
Series(np.sum(vs > 0, axis=0)).rename('Non-missing').to_frame().T

## Use sectoring scheme: assign each permno to a Fama-French 49 group
scheme = 'codes49'
codes = {scheme: Sectoring(sql, scheme, fillna=0)}
vs[scheme] = codes[scheme][vs['sic']]
# drop rows whose sic could not be mapped (left at the scheme's fill value)
vs = vs[vs[scheme].ne(codes[scheme].fillna)]
import os, time
from datetime import datetime
from settings import settings
from finds.database import SQL, Redis
from finds.structured import PSTAT, CRSP, IBES, Benchmarks, Signals
from finds.busday import BusDay, Weekly
from finds.structured import as_signal
from finds.backtesting import BackTest
from finds.solve import fractiles

LAST_DATE = settings['crsp_date']   # last date of CRSP data available
# connections to SQL databases, Redis cache, and business-day calendar
sql = SQL(**settings['sql'])
user = SQL(**settings['user'])
rdb = Redis(**settings['redis'])
bd = BusDay(sql)
pstat = PSTAT(sql, bd)
crsp = CRSP(sql, bd, rdb)
ibes = IBES(sql, bd)
bench = Benchmarks(sql, bd)
signals = Signals(user)
backtest = BackTest(user, bench, 'RF', LAST_DATE)
outdir = os.path.join(settings['images'], 'factors')

# signals to flip signs when forming spread portfolios
flips = {'mom1m':-1, 'mom36m':-1, 'pricedelay':-1, 'absacc':-1, 'acc':-1,
         'agr':-1, 'chcsho':-1, 'egr':-1, 'mve_ia':-1, 'pctacc':-1,
         'aeavol':-1, 'disp':-1, 'stdacc':-1, 'stdcf':-1, 'secured':-1,
         'maxret':-1, 'ill':-1, 'zerotrade':-1, 'cashpr':-1, 'chinv':-1,
         'invest':-1, 'cinvest':-1}

## Helpers to lag characteristics and roll returns
def as_lags(df, var, key, nlags):
    # NOTE(review): definition continues beyond this chunk -- body not visible
# Filter the stock-returns panel to columns with non-extreme volatility
# NOTE(review): df, settings, np, pd, Series are defined earlier in the
# file -- not visible in this chunk.
siccd = df['siccd']
df = df.iloc[:, 5:].dropna().T   # rows become time samples, columns stocks
# trim stocks whose return std dev is in the extreme 1% tails
lb, ub = np.nanpercentile(df.std(axis=0), [1, 99])
X = df.loc[:, df.std(axis=0).between(lb, ub)]
siccd = siccd[X.columns]   # keep siccd aligned with retained columns
print('samples/time(T) x features/stocks(N)', X.shape)

# Sectoring on sic or naics per FamaFrench 12-group scheme
from finds.database import SQL
from finds.busday import BusDay
from finds.structured import PSTAT
from finds.sectors import Sectoring
sql = SQL(**settings['sql'])
user = SQL(**settings['user'])
bd = BusDay(sql)
pstat = PSTAT(sql, bd)
sic = pstat.build_lookup('lpermno', 'sic', fillna=0)      # to lookup sic code
naics = pstat.build_lookup('lpermno', 'naics', fillna=0)  # to lookup naics code
naics2sic = Sectoring(sql, 'sic', fillna=0)  # cross-walk
out = Series(data=sic[X.columns], index=X.columns, name='sector')
# fall back to naics crosswalk, then to the panel's own siccd, when sic missing
out[out.isnull()] = naics2sic[naics[out.index[out.isnull()]]]
out[out.isnull()] = siccd[out.isnull()]
codes = Sectoring(sql, 'codes12', fillna=0)  # use FF's 12-group sectoring
sectors = codes[out]   # apply sic to 12-sector map
# integer-encode sectors, then expand to a one-hot indicator matrix
sector_index, sector_label = pd.factorize(sectors)
sector_onehot = np.eye(len(sector_label))[sector_index]
Series(sectors).value_counts().rename('counts').to_frame().T

# PCA of returns covariances by SVD
# SVD: u S vT = x (T samples x N stocks)
index_formatter=bd.offset) for col in df.columns: bench.load_series(df[col], name=name, item=item) print(DataFrame(**sql.run('select * from ' + bench['ident'].key))) """ """Weekly: price update and clear affected redis store run yahoo ./redis-cli --scan --pattern '*CRSP_2021*' | xargs ./redis-cli del """ # Estimate daily factors LAST_DATE = 20210618 # last date in daily prices table bd = BusDay(sql) bench = Benchmarks(sql, bd) crsp = CRSP(sql, bd, rdb) pstat = PSTAT(sql, bd) ## Rebalance and return dates, and initialize classes for calculations rebalbeg = bd.offset(20190630) rebals = [bd.offset(d) for d in [20200630]] stocks = chunk_stocks(crsp, rebalbeg, LAST_DATE) perf = DailyPerformance(stocks) # Compute HML factor label = 'hml' lag = 6 # number of months to lag fundamental data df = pstat.get_linked( # retrieve required fields from compustat dataset='annual', date_field='datadate', fields=['seq', 'pstk', 'pstkrv', 'pstkl', 'txditc'], where=(f"indfmt = 'INDL' AND datafmt = 'STD' AND curcd = 'USD' "