# Imports assumed by this fragment (flattened in the source); `config` is
# the script's local module of data paths and credentials.
import os
from pandas import DataFrame
from finds.database import SQL
from finds.busday import BusDay
from finds.structured import PSTAT
from finds.edgar import EdgarClone, Edgar
import config

def parse_items(years=years):
    ed = EdgarClone(config.datapath['10X'], zipped=False)
    sql = SQL(**config.credentials['sql'])
    bday = BusDay(sql)
    pstat = PSTAT(sql, bday)
    to_permno = pstat.build_lookup(target='lpermno', source='cik', fillna=0)
    items = {'10-K': ['bus10K', 'mda10K']}   # optionally add '10-Q': ['mda10Q']
    logger = []
    for year in years:   # e.g. 2019-2021; coverage starts 1998
        rows = ed.open(date=year)
        for i, row in rows.iterrows():
            permno = to_permno(int(row['cik']))
            if row['form'] in items and permno:
                filing = ed[row['pathname']]
                for item in items[row['form']]:
                    extract = Edgar.extract_item(filing, item)
                    s = ed.to_path(form=row['form'], permno=permno, item=item,
                                   basename=os.path.basename(row['pathname']))
                    with open(s, 'wt') as g:
                        g.write(extract)
                    r = {'year': year, 'permno': permno, 'item': item,
                         'text_c': len(filing), 'item_c': len(extract),
                         'text_w': len(filing.split()),
                         'item_w': len(extract.split())}
                    logger.append(r)
                    print(", ".join([f"{k}: {v}" for k, v in r.items()]))
    logger = DataFrame.from_records(logger)
    return logger   # assumed intent: the flattened source ends here
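# Hedged usage sketch (the years argument here is hypothetical, not from
# the source): summarize extraction coverage from the returned log as the
# share of each filing's words captured by the extracted item.
logger = parse_items(years=range(1998, 2021))
summary = logger.groupby(['year', 'item'])[['text_w', 'item_w']].sum()
summary['frac'] = summary['item_w'] / summary['text_w']
print(summary)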
import os
import numpy as np       # missing from the flattened source; np.arange below
import pandas as pd
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
import igraph            # pip3 install cairocffi
import rpy2.robjects as ro
from rpy2.robjects.packages import importr
from finds.pyR import PyR
from finds.busday import BusDay
from finds.database import SQL, Redis
from finds.structured import CRSP, PSTAT
from finds.sectors import Sectoring, BEA
from finds.graph import igraph_draw
from settings import settings

ECHO = True
sql = SQL(**settings['sql'])
bd = BusDay(sql)
rdb = Redis(**settings['redis'])
crsp = CRSP(sql, bd, rdb)
pstat = PSTAT(sql, bd)
bea = BEA(rdb, **settings['bea'], echo=ECHO)
logdir = None            # os.path.join(settings['images'], 'bea')
years = np.arange(1947, 2020)
vintages = [1997, 1963, 1947]   # when sectoring schemes were revised

# Read IOUse tables from BEA website
ioUses = dict()
for vintage in vintages:
    for year in [y for y in years if y >= vintage]:
        df = bea.read_ioUse(year, vintage=vintage)
        ioUses[(vintage, year)] = df
    print(f"{len(ioUses)} tables through sectoring vintage year {vintage}")
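# A small hedged sketch (not from the source): tally the tables collected
# per sectoring vintage, since ioUses is keyed by (vintage, year) tuples.
counts = Series([v for v, _ in ioUses], name='vintage').value_counts().sort_index()
print(counts)   # vintage 1947 should span every year through 2019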
import os
import io
import zipfile
import requests          # these four were missing from the flattened source
import pandas as pd
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
import seaborn as sns
import igraph            # pip3 install cairocffi
from igraph import Graph
from itertools import chain
from finds.graph import igraph_info, igraph_community
from finds.database import SQL
from finds.busday import BusDay
from finds.structured import PSTAT
from finds.sectors import Sectoring
from settings import settings

sql = SQL(**settings['sql'])
bd = BusDay(sql)
pstat = PSTAT(sql, bd)
logdir = os.path.join(settings['images'], 'tnic')   # None
tnic_scheme = 'tnic2'

# Retrieve TNIC scheme from Hoberg and Phillips website
# https://hobergphillips.tuck.dartmouth.edu/industryclass.htm
root = 'https://hobergphillips.tuck.dartmouth.edu/idata/'
source = os.path.join(root, tnic_scheme + '_data.zip')
if source.startswith('http'):
    response = requests.get(source)
    source = io.BytesIO(response.content)
with zipfile.ZipFile(source).open(tnic_scheme + "_data.txt") as f:
    tnic_data = pd.read_csv(f, sep=r'\s+')   # raw string avoids escape warning
tnic_data.head()
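# Hedged sketch (assumes the published gvkey1/gvkey2/year/score columns):
# build an undirected peer-similarity graph for the latest year in igraph.
latest = tnic_data[tnic_data['year'] == tnic_data['year'].max()]
g = Graph.TupleList(latest[['gvkey1', 'gvkey2']].itertuples(index=False),
                    directed=False)
g.es['weight'] = latest['score'].to_list()
print(g.vcount(), 'firms,', g.ecount(), 'pairwise links')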
import time
import os
import pandas as pd      # missing from the flattened source; pd.to_datetime below
from pandas import DataFrame, Series
from matplotlib import colors
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from finds.database import SQL, Redis
from finds.structured import CRSP
from finds.busday import BusDay
from finds.taq import opentaq, itertaq, open_t, close_t, bin_trades, bin_quotes
from finds.display import plot_time, row_formatted
from finds.solve import weighted_average
from settings import settings

sql = SQL(**settings['sql'])
user = SQL(**settings['user'])
bday = BusDay(sql)
rdb = Redis(**settings['redis'])
crsp = CRSP(sql, bday, rdb=rdb)
logdir = os.path.join(settings['images'], 'micro')   # None
taqdir = os.path.join(settings['remote'], 'TAQ')
_open = pd.to_datetime('1900-01-01T9:30')    # exclude times <= 9:30
_close = pd.to_datetime('1900-01-01T16:00')  # exclude times > 16:00

# Loop through the sample TAQ data dates available from NYSE and collect info
shareclass = []
daily = []
bins = {k: {} for k in ['effective', 'realized', 'impact', 'quoted', 'volume',
                        'offersize', 'bidsize', 'ret', 'retq', 'counts']}
tic = time.time()
intervals = [(v, 's') for v in [1, 2, 5, 15, 30]] + [(v, 'm') for v in [1, 2, 5]]
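# Hedged sketch of what the (value, unit) intervals imply (hypothetical
# helper, not from finds.taq): right-closed bin edges over the 9:30-16:00
# session, e.g. 78 five-minute bins in the 390-minute trading day.
def bin_edges(v, unit):
    return pd.date_range(_open, _close, freq=f"{v}{'min' if unit == 'm' else 's'}")[1:]
print({f"{v}{u}": len(bin_edges(v, u)) for v, u in intervals})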
License: MIT
"""
import pandas as pd
from pandas import DataFrame, Series
import numpy as np
import matplotlib.pyplot as plt
import os
import time
from finds.database import SQL, Redis
from finds.structured import CRSP
from finds.busday import BusDay, Weekly
from settings import settings

sql = SQL(**settings['sql'])
user = SQL(**settings['user'])
rdb = Redis(**settings['redis'])
bd = BusDay(sql)
crsp = CRSP(sql, bd, rdb)
logdir = os.path.join(settings['images'], 'weekrev')

# Construct weekly reversal
rebalbeg = 19730629   # increased stocks coverage in CRSP from around this date
rebalend = 20210101   # a Friday, so can include last week in 2020
wd = Weekly(sql, 'Fri', rebalbeg, rebalend)   # generate Friday-end weekly cal

# Retrieve weekly returns, standardize scores, and compute returns and i.c.
june_universe = 0   # to track when reached a June end to update universe
year = 0            # to track new year to retrieve prices in batch for screening
res = DataFrame()
tic = time.time()
for rebaldate in wd.date_range(rebalbeg, rebalend)[:-1]:
    d = bd.june_universe(rebaldate)
import os                # missing from the flattened source; os.path below
from sklearn.feature_extraction import text
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
from tqdm import tqdm
from collections import Counter
from finds.database import SQL, MongoDB, Redis
from finds.structured import CRSP, PSTAT
from finds.busday import BusDay, int2date
from finds.unstructured import Unstructured
from finds.edgar import EdgarClone, Edgar
from finds.graph import igraph_info, igraph_community
from finds.sectors import Sectoring
from settings import settings

ECHO = True
sql = SQL(**settings['sql'])
user = SQL(**settings['user'])
bd = BusDay(sql)
rdb = Redis(**settings['redis'])
crsp = CRSP(sql, bd, rdb)
mongodb = MongoDB(**settings['mongodb'])
wordlists = Unstructured(mongodb, 'WordLists')
ed = EdgarClone(settings['10X'], zipped=True, echo=ECHO)
imgdir = os.path.join(settings['images'], 'edgar')
item, form = 'bus10K', '10-K'

# Retrieve business description (10-K item 1) for 'aapl' from Edgar
from nltk.tokenize import RegexpTokenizer
ticker = 'AAPL'
cik = Edgar.fetch_tickers()[ticker.lower()]   # look up aapl's cik
stop_words = [w for c in ['genericlong', 'DatesandNumbers']   # LM stop word lists
              for w in wordlists['lm', c.lower()]]   # if "'" not in w
top_words = {}
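# Hedged sketch of the tokenization step (an assumption about what follows,
# not the original loop): lower-cased alphabetic tokens, LM stop words removed.
tokenizer = RegexpTokenizer(r"[a-z]+")
stops = set(stop_words)   # set membership beats scanning the list
def tokenize(doc):
    return [w for w in tokenizer.tokenize(doc.lower()) if w not in stops]
print(tokenize("Apple designs, manufactures and markets smartphones")[:5])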
# Excerpt: methods of the Weekly trading-calendar class (the enclosing
# class definition is not shown in this fragment).
def endwk(self, date, weeks=0):
    """Return ending business week date/s"""
    return _to_values(self.weeks['end'].iloc[self.numwk(date) + weeks])

def ismonthend(self, date):
    """If date/s in last complete week in any month"""
    return _to_values(self.weeks['ismonthend'].iloc[self.numwk(date)])

if False:   # create custom busday trading dates
    from settings import settings
    from finds.database import SQL
    from finds.busday import BusDay
    sql = SQL(**settings['sql'], echo=True)
    busday = BusDay(sql, create=True)   # set create flag as True

if False:   # some unit tests
    from settings import settings
    from finds.database import SQL
    from finds.busday import Weekly
    sql = SQL(**settings['sql'], echo=True)
    wd = Weekly(sql, day=3, end=20201231)   # derive weekly trading calendar
    print(wd.numwk(20201230))
    print(wd.numwk(20210130))
    print(wd.numwk(20201231))
    print(wd.endwk([20201209, 20201219]))
    print(wd.endwk(20201209))
    print(wd.endmo([20201209, 20201219]))
    print(wd.endmo(20201209))
import matplotlib.pyplot as plt
import os, time
from datetime import datetime
from settings import settings
from finds.database import SQL, Redis
from finds.structured import PSTAT, CRSP, IBES, Benchmarks, Signals
from finds.busday import BusDay, Weekly
from finds.structured import as_signal
from finds.backtesting import BackTest
from finds.solve import fractiles

LAST_DATE = settings['crsp_date']
sql = SQL(**settings['sql'])
user = SQL(**settings['user'])
rdb = Redis(**settings['redis'])
bd = BusDay(sql)
pstat = PSTAT(sql, bd)
crsp = CRSP(sql, bd, rdb)
ibes = IBES(sql, bd)
bench = Benchmarks(sql, bd)
signals = Signals(user)
backtest = BackTest(user, bench, 'RF', LAST_DATE)
outdir = os.path.join(settings['images'], 'factors')

# signals to flip signs when forming spread portfolios
flips = {'mom1m': -1, 'mom36m': -1, 'pricedelay': -1, 'absacc': -1, 'acc': -1,
         'agr': -1, 'chcsho': -1, 'egr': -1, 'mve_ia': -1, 'pctacc': -1,
         'aeavol': -1, 'disp': -1, 'stdacc': -1, 'stdcf': -1, 'secured': -1,
         'maxret': -1, 'ill': -1, 'zerotrade': -1, 'cashpr': -1, 'chinv': -1,
         'invest': -1, 'cinvest': -1}

## Helpers to lag characteristics and roll returns
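# Hedged sketch of one such helper (an assumption, not the original code):
# lag a characteristic by n observations within each firm's own history,
# so a signal formed at time t uses only data known through t-n.
def lag_characteristic(df, col, by='permno', n=1):
    """Shift column by n periods within each group of a long panel df."""
    return df.groupby(by)[col].shift(n)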
print("\n".join(f"[{i}] {d}" for i, d in enumerate(datasets)))
for name, item, suffix in datasets:
    df = fetch_FamaFrench(name=name, item=item, suffix=suffix,
                          index_formatter=bd.offset)
    for col in df.columns:
        bench.load_series(df[col], name=name, item=item)
print(DataFrame(**sql.run('select * from ' + bench['ident'].key)))
"""

"""Weekly: price update and clear affected redis store

run yahoo
./redis-cli --scan --pattern '*CRSP_2021*' | xargs ./redis-cli del
"""

# Estimate daily factors
LAST_DATE = 20210618   # last date in daily prices table
bd = BusDay(sql)
bench = Benchmarks(sql, bd)
crsp = CRSP(sql, bd, rdb)
pstat = PSTAT(sql, bd)

## Rebalance and return dates, and initialize classes for calculations
rebalbeg = bd.offset(20190630)
rebals = [bd.offset(d) for d in [20200630]]
stocks = chunk_stocks(crsp, rebalbeg, LAST_DATE)
perf = DailyPerformance(stocks)

# Compute HML factor
label = 'hml'
lag = 6   # number of months to lag fundamental data
df = pstat.get_linked(   # retrieve required fields from compustat
    dataset='annual',
# (Fragment: assumes prices, dividends, pathname, and crsp_date were
# defined earlier in the script.)
df = pd.read_csv(os.path.join(pathname, 'prices.csv.gz'), sep='|')
new = set(np.unique(df['ticker'])).difference(set(np.unique(prices['ticker'])))
df = df[df['ticker'].isin(new)]
prices = pd.concat([prices, df], sort=False)   # DataFrame.append removed in pandas 2.0
print(pathname, 'added prices', new)

df = pd.read_csv(os.path.join(pathname, 'dividends.csv.gz'), sep='|')
new = set(np.unique(df['ticker'])).difference(set(np.unique(dividends['ticker'])))
df = df[df['ticker'].isin(new)]
dividends = pd.concat([dividends, df], sort=False)
print(pathname, 'added dividends', new)

sql = SQL(**config.credentials['sql'], echo=config.ECHO)
bd = BusDay(sql)
crsp = CRSP(sql, bd, rdb=None)
date = bd.offset(crsp_date)

# get price and shrout as of last date
price = crsp.get_section('daily', ['prc', 'shrout'], 'date', date, start=None)

# get tickers to look up permno
tickers = crsp.get_section('names', ['tsymbol', 'date'], 'date', date,
                           start=0).reindex(price.index)
tickers = tickers.sort_values(['tsymbol', 'date'])\
import os
import numpy as np       # os and numpy were missing from the flattened source
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import pandas_datareader as pdr
from pandas_datareader.data import DataReader
from pandas_datareader.famafrench import FamaFrenchReader
from finds.database import SQL, Redis
from finds.structured import CRSP, Signals, Benchmarks
from finds.busday import BusDay
from finds.backtesting import RiskPremium
from finds.solve import winsorized
from settings import settings

LAST_DATE = settings['crsp_date']
sql = SQL(**settings['sql'])
user = SQL(**settings['user'])
rdb = Redis(**settings['redis'])
bd = BusDay(sql)
crsp = CRSP(sql, bd, rdb)
bench = Benchmarks(sql, bd)
signals = Signals(user)
logdir = os.path.join(settings['images'], 'fm')

def least_squares(data=None, y=['y'], x=['x'], stdres=False):
    """Helper to compute least-squares coefficients, supports groupby().apply"""
    X = data[x].to_numpy()
    Y = data[y].to_numpy()
    X = np.hstack([np.ones((X.shape[0], 1)), X])
    x = ['Intercept'] + x
    b = np.dot(np.linalg.inv(np.dot(X.T, X)), np.dot(X.T, Y)).T
    if stdres:
        b = np.hstack([b, np.std(Y - (X @ b.T), axis=0).reshape(-1, 1)])
import os                # missing from the flattened source; os.path below
import sklearn.feature_extraction
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import RegexpTokenizer
from tqdm import tqdm
from finds.database import SQL, MongoDB, Redis
from finds.structured import CRSP, Signals
from finds.busday import BusDay
from finds.unstructured import Unstructured
from finds.readers import fetch_lm
from finds.solve import weighted_average, fractiles
from finds.edgar import EdgarClone
from settings import settings

ECHO = False
sql = SQL(**settings['sql'])
user = SQL(**settings['user'])
bd = BusDay(sql)
rdb = Redis(**settings['redis'])
crsp = CRSP(sql, bd, rdb)
ed = EdgarClone(settings['10X'], zipped=True, echo=ECHO)
signals = Signals(user)
mongodb = MongoDB(**settings['mongodb'])
wordlists = Unstructured(mongodb, 'WordLists')
imgdir = os.path.join(settings['images'], 'edgar')
item, form = 'mda10K', '10-K'

# Load Loughran and MacDonald sentiment words and stopwords
mongodb = MongoDB()
wordlists = Unstructured(mongodb, 'WordLists')
sentiments = {k: wordlists['lm', k] for k in ['positive', 'negative']}

# Pre-process with sklearn methods
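# A minimal hedged sketch of the idea (an assumed continuation, not the
# original code): count LM positive/negative hits with a fixed vocabulary.
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
docs = ["strong revenue growth", "impairment charges and losses hurt results"]
for sentiment, words in sentiments.items():
    vec = CountVectorizer(vocabulary=sorted(set(w.lower() for w in words)))
    hits = vec.fit_transform(docs).sum(axis=1)   # total hits per document
    print(sentiment, np.asarray(hits).ravel())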
import os
import time
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
from settings import settings
from finds.database import SQL, Redis
from finds.busday import BusDay
from finds.structured import PSTAT, CRSP, Benchmarks, Signals
from finds.backtesting import BackTest
from finds.solve import fractiles

LAST_DATE = settings['crsp_date']
sql = SQL(**settings['sql'])
user = SQL(**settings['user'])
rdb = Redis(**settings['redis'])
bd = BusDay(sql)
pstat = PSTAT(sql, bd)
crsp = CRSP(sql, bd, rdb)
bench = Benchmarks(sql, bd)
signals = Signals(user)
backtest = BackTest(user, bench, 'RF', LAST_DATE)
logdir = os.path.join(settings['images'], 'ff')   # None

# Load items from Compustat Annual.
# Construct HML as shareholders' equity plus investment tax credits, less
# preferred stock, divided by December market cap.
# Require a 6-month reporting lag and at least two years of history in Compustat.
label = 'hml'
lag = 6   # number of months to lag fundamental data
df = pstat.get_linked(   # retrieve required fields from compustat