from pandas import DataFrame, Series from matplotlib import colors import matplotlib.pyplot as plt from sklearn.decomposition import PCA from finds.database import SQL, Redis from finds.structured import CRSP from finds.busday import BusDay from finds.taq import opentaq, itertaq, open_t, close_t, bin_trades, bin_quotes from finds.display import plot_time, row_formatted from finds.solve import weighted_average from settings import settings sql = SQL(**settings['sql']) user = SQL(**settings['user']) bday = BusDay(sql) rdb = Redis(**settings['redis']) crsp = CRSP(sql, bday, rdb=rdb) logdir = os.path.join(settings['images'], 'micro') # None taqdir = os.path.join(settings['remote'], 'TAQ') _open = pd.to_datetime('1900-01-01T9:30') # exclude <= _close = pd.to_datetime('1900-01-01T16:00') # exclude > # Loop through the sample TAQ data dates available from NYSE and collect info shareclass = [] daily = [] bins = {k:{} for k in ['effective', 'realized', 'impact', 'quoted', 'volume', 'offersize', 'bidsize', 'ret', 'retq', 'counts']} tic = time.time() intervals = [(v,'s') for v in [1,2,5,15,30]] + [(v,'m') for v in [1,2,5]] dates = [20191007, 20191008, 20180305, 20180306] for d, date in enumerate(dates):
"""Setup for BEA input-output (IOUse) sector flow analysis.

Reads IOUse tables from the BEA website for every year covered by each
sectoring-scheme vintage.
"""
import numpy as np  # FIX: was missing — np.arange used below
import matplotlib.pyplot as plt
import igraph  # pip3 install cairocffi
import rpy2.robjects as ro
from rpy2.robjects.packages import importr
from finds.pyR import PyR
from finds.busday import BusDay
from finds.database import SQL, Redis
from finds.structured import CRSP, PSTAT
from finds.sectors import Sectoring, BEA
from finds.graph import igraph_draw
from settings import settings

ECHO = True
sql = SQL(**settings['sql'])
bd = BusDay(sql)
rdb = Redis(**settings['redis'])
crsp = CRSP(sql, bd, rdb)
pstat = PSTAT(sql, bd)
bea = BEA(rdb, **settings['bea'], echo=ECHO)
logdir = None  # os.path.join(settings['images'], 'bea')
years = np.arange(1947, 2020)
vintages = [1997, 1963, 1947]  # when sectoring schemes were revised

# Read IOUse tables from BEA website
ioUses = dict()
for vintage in vintages:
    for year in [y for y in years if y >= vintage]:
        df = bea.read_ioUse(year, vintage=vintage)
        ioUses[(vintage, year)] = df
    # NOTE(review): print placed at per-vintage level (cumulative count);
    # original formatting was mangled — confirm it was not per-year
    print(f"{len(ioUses)} tables through sectoring vintage year {vintage}")
# Smooth average flows over several years, for social relations regression
# NOTE(review): stray triple-quote below appears to close a module docstring
# whose opening is outside this fragment — verify against the full file
"""
import pandas as pd
from pandas import DataFrame, Series
import numpy as np
import matplotlib.pyplot as plt
import os
import time
from finds.database import SQL, Redis
from finds.structured import CRSP
from finds.busday import BusDay, Weekly
from settings import settings

# Database, cache and calendar handles from project settings
sql = SQL(**settings['sql'])
user = SQL(**settings['user'])
rdb = Redis(**settings['redis'])
bd = BusDay(sql)
crsp = CRSP(sql, bd, rdb)
logdir = os.path.join(settings['images'], 'weekrev')

# Construct weekly reversal
rebalbeg = 19730629  # increased stocks coverage in CRSP from around this date
rebalend = 20210101  # a Friday, so can include last week in 2020
wd = Weekly(sql, 'Fri', rebalbeg, rebalend)  # Generate Friday-end weekly cal

# Retrieve weekly returns, standardize scores, and compute returns and i.c.
june_universe = 0  # to track when reached a June end to update universe
year = 0  # to track new year to retrieve prices in batch for screening
res = DataFrame()   # accumulates results per rebalance date
tic = time.time()
# iterate weekly rebalance dates, excluding the final date
# NOTE(review): loop body is truncated beyond this fragment
for rebaldate in wd.date_range(rebalbeg, rebalend)[:-1]:
    d = bd.june_universe(rebaldate)
    if d != june_universe:
# Setup for key-development event study: database handles, event window
# parameters, and pretty-print helpers for event/role id's.
import os
import time
from finds.database import SQL
from finds.busday import BusDay
from finds.structured import PSTAT, CRSP, Benchmarks
from finds.backtesting import EventStudy
from settings import settings

LAST_DATE = settings['crsp_date']
ECHO = True
sql = SQL(**settings['sql'], echo=ECHO)
user = SQL(**settings['user'], echo=ECHO)
bd = BusDay(sql)
keydev = PSTAT(sql, bd)           # key-development events source
crsp = CRSP(sql, bd, rdb=None)    # no redis cache for this run
bench = Benchmarks(sql, bd)
eventstudy = EventStudy(user, bench, LAST_DATE)
outdir = os.path.join(settings['images'], 'events')

# event window parameters
end = 20201201
beg = 19890101  # 20020101
minobs = 250                      # minimum observations per event/role
left, right, post = -1, 1, 21     # days relative to event: window and post-drift

# str formatter to pretty print event and role description given their id's
eventformat = lambda e, r: "{event} ({eventid}) {role} [{roleid}]".format(
    event=keydev.event_[e], eventid=e, role=keydev.role_[r], roleid=r)
events = sorted(keydev.event_.keys())  # list of eventid's
roles = sorted(keydev.role_.keys())    # list of roleid's
"""Setup for analyzing 10-K business descriptions (item 1) from EDGAR.

Builds database/cache handles, loads LM stop-word lists from MongoDB,
and begins collecting top words by filing year.
"""
import os  # FIX: was missing — os.path.join used below
from tqdm import tqdm
from collections import Counter
from finds.database import SQL, MongoDB, Redis
from finds.structured import CRSP, PSTAT
from finds.busday import BusDay, int2date
from finds.unstructured import Unstructured
from finds.edgar import EdgarClone, Edgar
from finds.graph import igraph_info, igraph_community
from finds.sectors import Sectoring
from settings import settings

ECHO = True
sql = SQL(**settings['sql'])
user = SQL(**settings['user'])
bd = BusDay(sql)
rdb = Redis(**settings['redis'])
crsp = CRSP(sql, bd, rdb)
mongodb = MongoDB(**settings['mongodb'])
wordlists = Unstructured(mongodb, 'WordLists')
ed = EdgarClone(settings['10X'], zipped=True, echo=ECHO)
imgdir = os.path.join(settings['images'], 'edgar')
item, form = 'bus10K', '10-K'

# Retrieve business description (10-K item 1) for 'aapl' from Edgar
from nltk.tokenize import RegexpTokenizer
ticker = 'AAPL'
cik = Edgar.fetch_tickers()[ticker.lower()]  # lookup aapl's cik
stop_words = [w for c in ['genericlong', 'DatesandNumbers']  # LM stop word lists
              for w in wordlists['lm', c.lower()]]  # if "'" not in w]
top_words = {}
# NOTE(review): loop body is truncated beyond this fragment
for year in [2003, 2020]:
    files = Edgar.fetch_index(year=year, quarter=4)  #
    return df

# NOTE(review): dead test/driver code — guarded by `if False:` so it never
# executes; retained as-is. It exercised Sectoring schemes interactively.
if False:
    import os
    from finds.sectors import Sectoring, BEA
    from finds.database import SQL, Redis
    from finds.structured import PSTAT, CRSP
    from finds.busday import BusDay
    from settings import settings
    downloads = settings['remote']
    sql = SQL(**settings['sql'])
    bd = BusDay(sql)
    rdb = Redis(**settings['redis'])  # None
    crsp = CRSP(sql, bd, rdb)
    pstat = PSTAT(sql, bd)
    bea = BEA(rdb, **settings['bea'])
    # construct Sectoring objects for each scheme variant
    scheme = 'sic2'
    codes = Sectoring(sql, scheme)
    scheme = 'sic3'
    codes = Sectoring(sql, scheme)
    for scheme in [5, 10, 12, 17, 30, 38, 48, 49]:
        codes = Sectoring(sql, f"codes{scheme}")
        codes.load(source=None)
    scheme = 'SIC'  # SIC from NAICS Crosswalk
    codes = Sectoring(sql, scheme)
"""Setup for characteristic-signal factor construction and backtesting."""
import os  # FIX: was missing — os.path.join used below
from datetime import datetime
from settings import settings
from finds.database import SQL, Redis
from finds.structured import PSTAT, CRSP, IBES, Benchmarks, Signals
from finds.busday import BusDay, Weekly
from finds.structured import as_signal
from finds.backtesting import BackTest
from finds.solve import fractiles

LAST_DATE = settings['crsp_date']
sql = SQL(**settings['sql'])
user = SQL(**settings['user'])
rdb = Redis(**settings['redis'])
bd = BusDay(sql)
pstat = PSTAT(sql, bd)
crsp = CRSP(sql, bd, rdb)
ibes = IBES(sql, bd)
bench = Benchmarks(sql, bd)
signals = Signals(user)
backtest = BackTest(user, bench, 'RF', LAST_DATE)
outdir = os.path.join(settings['images'], 'factors')

# signals to flip signs when forming spread portfolios
flips = {'mom1m': -1, 'mom36m': -1, 'pricedelay': -1, 'absacc': -1, 'acc': -1,
         'agr': -1, 'chcsho': -1, 'egr': -1, 'mve_ia': -1, 'pctacc': -1,
         'aeavol': -1, 'disp': -1, 'stdacc': -1, 'stdcf': -1, 'secured': -1,
         'maxret': -1, 'ill': -1, 'zerotrade': -1, 'cashpr': -1, 'chinv': -1,
         'invest': -1, 'cinvest': -1}

## Helpers to lag characteristics and roll returns
# NOTE(review): function body is truncated beyond this fragment
def as_lags(df, var, key, nlags):
    """Return dataframe with {nlags} of column {var}, same {key} value in row"""
# NOTE(review): this fragment begins inside a triple-quoted block of
# commented-out code whose opening quote is outside this view
df = fetch_FamaFrench(name=name, item=item, suffix=suffix,
                      index_formatter=bd.offset)
for col in df.columns:
    bench.load_series(df[col], name=name, item=item)
print(DataFrame(**sql.run('select * from ' + bench['ident'].key)))
"""

"""Weekly: price update and clear affected redis store
run yahoo
./redis-cli --scan --pattern '*CRSP_2021*' | xargs ./redis-cli del
"""

# Estimate daily factors
LAST_DATE = 20210618  # last date in daily prices table
bd = BusDay(sql)
bench = Benchmarks(sql, bd)
crsp = CRSP(sql, bd, rdb)
pstat = PSTAT(sql, bd)

## Rebalance and return dates, and initialize classes for calculations
rebalbeg = bd.offset(20190630)
rebals = [bd.offset(d) for d in [20200630]]
stocks = chunk_stocks(crsp, rebalbeg, LAST_DATE)
perf = DailyPerformance(stocks)

# Compute HML factor
label = 'hml'
lag = 6  # number of months to lag fundamental data
# NOTE(review): call is truncated beyond this fragment
df = pstat.get_linked(  # retrieve required fields from compustat
    dataset='annual', date_field='datadate',
    fields=['seq', 'pstk', 'pstkrv', 'pstkl', 'txditc'],
# NOTE(review): fragment begins mid-block (references `pathname`, `prices`,
# `dividends`, `config`, `crsp_date` defined outside this view)

# Append only tickers not already present in accumulated prices
new = set(np.unique(df['ticker'])).difference(
    set(np.unique(prices['ticker'])))
df = df[df['ticker'].isin(new)]
# FIX: DataFrame.append was deprecated (removed in pandas 2.0) -> pd.concat
prices = pd.concat([prices, df], sort=False)
print(pathname, 'added prices', new)

# Same for dividends: append only tickers not already present
df = pd.read_csv(os.path.join(pathname, 'dividends.csv.gz'), sep='|')
new = set(np.unique(df['ticker'])).difference(
    set(np.unique(dividends['ticker'])))
df = df[df['ticker'].isin(new)]
dividends = pd.concat([dividends, df], sort=False)  # FIX: was .append
print(pathname, 'added dividends', new)

sql = SQL(**config.credentials['sql'], echo=config.ECHO)
bd = BusDay(sql)
crsp = CRSP(sql, bd, rdb=None)
date = bd.offset(crsp_date)

# get price and shrout as of last date
price = crsp.get_section('daily', ['prc', 'shrout'], 'date', date,
                         start=None)

# get tickers to lookup permno
tickers = crsp.get_section('names', ['tsymbol', 'date'], 'date', date,
                           start=0).reindex(price.index)
tickers = tickers.sort_values(['tsymbol', 'date'])\
                 .drop_duplicates(keep='last')
"""Setup for Fama-MacBeth cross-sectional regressions."""
import os            # FIX: was missing — os.path.join used below
import numpy as np   # FIX: was missing — np.* used in least_squares below
import statsmodels.formula.api as smf
import pandas_datareader as pdr
from pandas_datareader.data import DataReader
from pandas_datareader.famafrench import FamaFrenchReader
from finds.database import SQL, Redis
from finds.structured import CRSP, Signals, Benchmarks
from finds.busday import BusDay
from finds.backtesting import RiskPremium
from finds.solve import winsorized
from settings import settings

LAST_DATE = settings['crsp_date']
sql = SQL(**settings['sql'])
user = SQL(**settings['user'])
rdb = Redis(**settings['redis'])
bd = BusDay(sql)
crsp = CRSP(sql, bd, rdb)
bench = Benchmarks(sql, bd)
signals = Signals(user)
logdir = os.path.join(settings['images'], 'fm')


# NOTE(review): function is truncated (no return visible in this fragment);
# mutable list defaults are rebound, not mutated, so benign here
def least_squares(data=None, y=['y'], x=['x'], stdres=False):
    """Helper to compute least square coefs, supports groupby().apply"""
    X = data[x].to_numpy()
    Y = data[y].to_numpy()
    X = np.hstack([np.ones((X.shape[0], 1)), X])  # prepend intercept column
    x = ['Intercept'] + x
    b = np.dot(np.linalg.inv(np.dot(X.T, X)), np.dot(X.T, Y)).T
    if stdres:
        # append residual std dev as an extra output column
        b = np.hstack([b, np.std(Y - (X @ b.T), axis=0).reshape(-1, 1)])
        x = x + ['stdres']
from nltk.tokenize import RegexpTokenizer from tqdm import tqdm from finds.database import SQL, MongoDB, Redis from finds.structured import CRSP, Signals from finds.busday import BusDay from finds.unstructured import Unstructured from finds.readers import fetch_lm from finds.solve import weighted_average, fractiles from finds.edgar import EdgarClone from settings import settings ECHO = False sql = SQL(**settings['sql']) user = SQL(**settings['user']) bd = BusDay(sql) rdb = Redis(**settings['redis']) crsp = CRSP(sql, bd, rdb) ed = EdgarClone(settings['10X'], zipped=True, echo=ECHO) signals = Signals(user) mongodb = MongoDB(**settings['mongodb']) wordlists = Unstructured(mongodb, 'WordLists') imgdir = os.path.join(settings['images'], 'edgar') item, form = 'mda10K', '10-K' # Load Loughran and MacDonald sentiment words and stopwords mongodb = MongoDB() wordlists = Unstructured(mongodb, 'WordLists') sentiments = {k: wordlists['lm', k] for k in ['positive', 'negative']} # Pre-process with sklearn methods tf_vectorizer = sklearn.feature_extraction.text.CountVectorizer( strip_accents='unicode',