if 'new' in testable:
    with open(os.path.join(outdir, 'index.html'), 'wt') as f:
        f.write('<h1>Quant Factors Zoo</h1><br>')
        f.write(' <br>')
        f.write('<p>\n')

# Momentum and divyld from CRSP monthly
if 'monthly' in testable:
    if regenerate:
        beg, end = 19251231, LAST_DATE
        intervals = {'mom12m': (2,12), 'mom36m': (13,36),
                     'mom6m': (2,6), 'mom1m': (1,1)}
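        # Each (a, b) pair compounds returns from just after the month-end b
        # months before the rebalance date through the month-end (a-1) months
        # before it: mom12m skips the month just ended, while mom1m is simply
        # the latest month's return (the short-term reversal signal).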
        for label, past in intervals.items():
            out = []    # collect one DataFrame of signal values per rebaldate
            for rebaldate in bd.date_range(bd.endmo(beg, past[1]), end, 'endmo'):
                start = bd.endmo(rebaldate, -past[1])
                beg1 = bd.offset(start, 1)
                end1 = bd.endmo(rebaldate, 1-past[0])
                df = crsp.get_universe(end1)
                df['start'] = crsp.get_section(dataset='monthly', fields=['ret'],
                                               date_field='date', date=start)\
                                  .reindex(df.index)
                df[label] = crsp.get_ret(beg1, end1).reindex(df.index)
                df['permno'] = df.index
                df['rebaldate'] = rebaldate
                df = df.dropna(subset=['start'])
                out.append(df[['rebaldate', 'permno', label]])   # collect rows
            n = signals.write(pd.concat(out, ignore_index=True), label,
                              overwrite=True)
    ax.set_title(f"Top Bigrams: {ticker} {year} 10-K Business Description")
    #plt.savefig(os.path.join(imgdir, f"{ticker}{year}.jpg"))
plt.show()


# Community Detection with Business Descriptions
# Load spacy vocab
lang = 'en_core_web_lg'
nlp = spacy.load(lang)
n_vocab, vocab_dim = nlp.vocab.vectors.shape
print('Language:', lang, '   vocab:', n_vocab, '   dim:', vocab_dim)
stopwords = {'company', 'companys', 'companies', 'product', 'products',
             'service', 'services', 'business', 'description', 'year', 'years'}

# Load stock universes
univs = {y: crsp.get_universe(bd.endmo(int(f"{y-1}1231"))).assign(year=y)
         for y in range(1993, 2021)}


## Extract lemmatized nouns and named entities from bus10K documents
A = ed.open(form=form, item=item)  # open bus10K archive
# assign fiscal year: filings dated before April count toward the prior year
A['year'] = [d.year - (d.month < 4) for d in int2date(A['date'])]

tic = time.time()
for year in [2020, 2019, 2018, 2017]:
    docs = dict()
    ners = dict()
    for i, permno in tqdm(enumerate(sorted(univs[year].index))):
        doc = A[A['permno'].eq(permno) & A['year'].eq(year)].sort_values('date')
        if len(doc):
            sent = ed[doc.iloc[0]['pathname']].encode('ascii', 'ignore').lower()
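
# The extraction loop above truncates before the spacy step.  A minimal,
# hypothetical sketch of the rest of the pipeline follows: lemmatized nouns and
# named entities are pulled from each description, each firm is embedded as the
# average of its word vectors, and communities are detected on a cosine
# similarity graph.  The networkx dependency, threshold, and helper names below
# are illustrative assumptions, not the original implementation.
import numpy as np
import networkx as nx
from networkx.algorithms import community

def extract_nouns_and_ents(text, nlp, stopwords):
    """Lemmatized nouns and named-entity strings from one business description"""
    doc = nlp(text)
    nouns = [t.lemma_.lower() for t in doc
             if t.pos_ in ('NOUN', 'PROPN') and t.is_alpha
             and not t.is_stop and t.lemma_.lower() not in stopwords]
    ents = [e.text.lower() for e in doc.ents]
    return nouns, ents

def doc_vector(tokens, nlp):
    """Average the spacy word vectors of a token list (zeros if none found)"""
    vecs = [nlp.vocab[w].vector for w in tokens if nlp.vocab[w].has_vector]
    return np.mean(vecs, axis=0) if vecs else np.zeros(vocab_dim)

def detect_communities(vectors, threshold=0.5):
    """Greedy modularity communities on a thresholded cosine-similarity graph"""
    permnos = list(vectors)
    X = np.vstack([vectors[p] for p in permnos])
    X = X / (np.linalg.norm(X, axis=1, keepdims=True) + 1e-12)
    sim = X @ X.T
    G = nx.Graph()
    G.add_nodes_from(permnos)
    for i in range(len(permnos)):
        for j in range(i + 1, len(permnos)):
            if sim[i, j] > threshold:
                G.add_edge(permnos[i], permnos[j], weight=float(sim[i, j]))
    return list(community.greedy_modularity_communities(G, weight='weight'))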
## compute HML portfolio holdings
signals = chunk_signal(df)
holdings = famafrench_sorts(crsp,
                            'hml',
                            signals,
                            rebalbeg,
                            LAST_DATE,
                            window=12,
                            months=[6],
                            rebals=rebals)['holdings']
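
# famafrench_sorts() is a library helper; for intuition only, here is a minimal
# sketch of the standard 2x3 size/value sort behind an HML factor.  The column
# names ('cap', 'nyse') and the breakpoint/weighting choices are assumptions
# about the universe DataFrame, not a description of the helper itself.
import numpy as np

def hml_weights(univ, value):
    """Signed, value-weighted HML holdings: long high-BM, short low-BM stocks"""
    df = univ.join(value.rename('value'), how='inner').dropna()
    nyse = df[df['nyse']]                            # NYSE-only breakpoints
    size_cut = nyse['cap'].median()                  # median market-cap split
    lo, hi = nyse['value'].quantile([0.3, 0.7])      # 30th/70th value percentiles
    df['size'] = np.where(df['cap'] <= size_cut, 'S', 'B')
    df['val'] = np.where(df['value'] <= lo, 'L',
                         np.where(df['value'] >= hi, 'H', 'M'))
    vw = df.groupby(['size', 'val'])['cap'].transform(lambda c: c / c.sum())
    sign = df['val'].map({'H': 0.5, 'M': 0.0, 'L': -0.5})  # HML = (SH+BH)/2 - (SL+BL)/2
    return (vw * sign).rename('hml')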

# Compute MOM momentum factor
label = 'mom'
past = (2, 12)
df = []  # collect each month's momentum signal values
rebalend = bd.endmo(LAST_DATE, -1)
for rebaldate in bd.date_range(rebalbeg, rebalend, 'endmo'):
    beg = bd.endmo(rebaldate, -past[1])  # require price at this date
    start = bd.offset(beg, 1)  # start date, inclusive, of signal
    end = bd.endmo(rebaldate, 1 - past[0])  # end date of signal
    p = [
        crsp.get_universe(rebaldate),  # retrieve prices and construct signal
        crsp.get_ret(start, end)['ret'].rename(label),
        crsp.get_section('monthly', ['prc'], 'date', beg)['prc'].rename('beg'),
        crsp.get_section('monthly', ['prc'], 'date', end)['prc'].rename('end')
    ]
    q = pd.concat(p, axis=1, join='inner').reset_index().dropna()
    q['rebaldate'] = rebaldate
    df.append(q[['permno', 'rebaldate', label]])
    print(rebaldate, len(df), len(q))
df = pd.concat(df)
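
# Worked example of the (2, 12) window for rebaldate = 20200630:
#   beg   = bd.endmo(rebaldate, -12) -> month-end June 2019 (price screen)
#   start = bd.offset(beg, 1)        -> first trading day of July 2019
#   end   = bd.endmo(rebaldate, -1)  -> month-end May 2020 (price screen)
# so the signal compounds returns over July 2019 through May 2020, skipping the
# most recent month (June 2020).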
                  & (facpr['close'] - facpr['prc'].abs()).abs().ge(0.009)
                  & (facpr['prc'].gt(0) |
                     (facpr['close'] - facpr['prc'].abs()).abs().gt(0.25))]

    # adjust shrout if necessary, then merge into big df of prices
    facpr['shrout'] *= (1 + facpr['facpr'])
    facpr = facpr.set_index('permno')
    price.loc[facpr.index, 'shrout'] = facpr.loc[facpr.index, 'shrout']
    prices = prices.join(price[['shrout']], on='permno', how='inner')
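    # e.g. a 2-for-1 split carries facpr = 1.0 (CRSP defines facpr as new shares
    # over old shares, minus one), so the adjustment above doubles shrout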

    # create monthly data df
    prices['month'] = prices['date'] // 100
    groupby = prices.groupby(['permno', 'month'])
    monthly = (groupby[['ret', 'retx']].prod() - 1)\
              .join(groupby['close'].last()).reset_index()
    monthly['month'] = bd.endmo(monthly['month'])
    monthly.rename(columns={'month': 'date', 'close': 'prc'}, inplace=True)
    monthly = monthly[monthly['date'] > date]
    print('monthly:', len(monthly), len(np.unique(monthly['permno'])))

    # clean up prices table to mimic daily table
    prices['ret'] -= 1     # returns were stored gross (1+r); convert back to net
    prices['retx'] -= 1    # likewise for returns excluding dividends
    prices.rename(columns={
        'open': 'openprc',
        'high': 'askhi',
        'low': 'bidlo',
        'close': 'prc',
        'volume': 'vol'
    },
                  inplace=True)
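
# The next fragment prints robust standard errors for a fitted OLS result `ls`
# estimated on a panel of portfolio returns; neither `ls` nor `rets` is defined
# in this excerpt.  A minimal sketch of how they might be set up (column names
# are assumptions) is:
#
#   import statsmodels.api as sm
#   rets = rets.sort_values(['port', 'date'])   # hac-panel needs data ordered by group
#   ls = sm.OLS(rets['xret'], sm.add_constant(rets[['mkt', 'smb', 'hml']])).fit()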
print(
    ls.get_robustcov_results('hac-panel', groups=rets['port'],
                             maxlags=3).summary())
print(ls.get_robustcov_results('cluster', groups=rets['port']).summary())

## Fama MacBeth with individual stocks and standardized scores as loadings
rebalbeg = 19640601
rebalend = LAST_DATE
rebaldates = crsp.bd.date_range(rebalbeg, rebalend, 'endmo')
loadings = dict()
for pordate in rebaldates:  # retrieve signal values every month
    date = bd.june_universe(pordate)
    univ = crsp.get_universe(date)
    cap = np.sqrt(crsp.get_cap(date)['cap'])
    smb = -np.log(cap).rename('size')
    hml = signals('hml', date, bd.endmo(date, -12))['hml'].rename('value')
    # shrink beta toward one: 2/3 weight on the estimate, 1/3 on a prior of 1
    beta = (signals('beta', pordate, bd.begmo(pordate))['beta'] * 2 / 3) + 1 / 3
    mom = signals('mom', pordate)['mom'].rename('momentum')
    df = pd.concat(
        (beta, hml, smb, mom),  # inner join of signals with univ
        join='inner',
        axis=1).reindex(univ.index).dropna()
    loadings[pordate] = winsorized(df, quantiles=[0.05, 0.95])
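
# RiskPremium below is a library helper; for intuition, a minimal sketch of the
# Fama-MacBeth second stage is shown here.  It assumes each loadings[pordate]
# holds standardized characteristics indexed by permno and that next-month
# returns come from crsp.get_ret(); the helper's actual interface may differ.
import numpy as np
import pandas as pd
import statsmodels.api as sm

def fama_macbeth(loadings, crsp, bd):
    """Monthly cross-sectional OLS of returns on loadings; average the slopes"""
    coefs = []
    for pordate, X in loadings.items():
        beg = bd.offset(pordate, 1)               # next month's holding period
        end = bd.endmo(pordate, 1)
        ret = crsp.get_ret(beg, end)['ret']
        data = X.join(ret, how='inner').dropna()
        res = sm.OLS(data['ret'], sm.add_constant(data[X.columns])).fit()
        coefs.append(res.params.rename(pordate))
    coefs = pd.DataFrame(coefs)                   # T x (1 + K) monthly estimates
    mean = coefs.mean()
    tstat = mean / (coefs.std() / np.sqrt(len(coefs)))  # Fama-MacBeth t-stats
    return pd.DataFrame({'mean': mean, 'tstat': tstat})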

## Compute coefficients from FM cross-sectional regressions
riskpremium = RiskPremium(user, bench, 'RF', LAST_DATE)
riskpremium(
    crsp,
    loadings,  # FM regressions on standardized scores
    weights=None,

## Example 6
wordlists = Unstructured(mongodb, 'WordLists')
sentiments = {k: wordlists['lm', k] for k in ['positive', 'negative']}

# Pre-process with sklearn methods
tf_vectorizer = sklearn.feature_extraction.text.CountVectorizer(
    strip_accents='unicode',
    lowercase=True,
    #stop_words=stop_words,
    # tokenizer=CustomTokenizer(),
    token_pattern=r"\b[^\d\W][^\d\W][^\d\W]+\b")
tokenize = tf_vectorizer.build_tokenizer()
analyze = tf_vectorizer.build_analyzer()
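
# Quick illustration of the analyzer: with the default lowercasing and the
# token_pattern above, tokens are words of three or more non-digit characters
# (no stop-word removal, since stop_words is commented out), e.g.
#   analyze("The Company reported 10% growth in Services")
#   -> ['the', 'company', 'reported', 'growth', 'services']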

# Construct sentiment feature all years for usual universe
univs = {
    yr + 1: crsp.get_universe(bd.endmo(yr * 10000 + 1231)).assign(year=yr + 1)
    for yr in range(1992, 2020)
}
results = []
files = ed.open(form=form, item=item)  # open mda10K archive
tic = time.time()
permnos = files['permno'].unique().astype(int)
for i, permno in tqdm(enumerate(permnos)):  # Loop over all permnos

    # retrieve all valid mda's for this permno by year
    mdas = {}
    dates = {}
    for _, f in files[files['permno'].eq(permno)].iterrows():
        year = int(f['date']) // 10000
        if ((f['date'] // 100) % 100) <= 3:  # if filing date <= Mar
            year = year - 1  # then assign to previous year
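
# The loop truncates here; for reference, a minimal sketch of the sentiment
# feature itself, using the Loughran-McDonald word lists loaded above, might
# look like the following (the mdas[year] text and the normalization by total
# token count are assumptions):
def lm_sentiment(text, analyze, sentiments):
    """Net Loughran-McDonald tone: (# positive - # negative) / # tokens"""
    tokens = analyze(text)
    if not tokens:
        return None
    pos = sum(w in sentiments['positive'] for w in tokens)
    neg = sum(w in sentiments['negative'] for w in tokens)
    return (pos - neg) / len(tokens)

# e.g. results.append({'permno': permno, 'year': year,
#                      'sentiment': lm_sentiment(mdas[year], analyze, sentiments)})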

## Example 7
plot_ff(y,
        label,
        corr=np.corrcoef(backtest.excess, rowvar=False)[0, 1],
        num=2,
        logdir=logdir)

## Construct Mom

# Load monthly universe and stock returns from CRSP.
# Signal is stocks' total return from 12 months ago, skipping most recent month
# Construct 2-way portfolio sorts, and backtest returns
label, benchname, past, leverage = 'mom', 'Mom(mo)', (2, 12), 1
rebalbeg, rebalend = 19260101, LAST_DATE
df = []  # collect each month's momentum signal values
for rebaldate in bd.date_range(rebalbeg, rebalend, 'endmo'):
    beg = bd.endmo(rebaldate, -past[1])  # require price at this date
    start = bd.offset(beg, 1)  # start date, inclusive, of signal
    end = bd.endmo(rebaldate, 1 - past[0])  # end date of signal
    p = [
        crsp.get_universe(rebaldate),  # retrieve prices and construct signal
        crsp.get_ret(start, end)['ret'].rename(label),
        crsp.get_section('monthly', ['prc'], 'date', beg)['prc'].rename('beg'),
        crsp.get_section('monthly', ['prc'], 'date', end)['prc'].rename('end')
    ]
    q = pd.concat(p, axis=1, join='inner').reset_index().dropna()
    q['rebaldate'] = rebaldate
    df.append(q[['permno', 'rebaldate', label]])
    print(rebaldate, len(df), len(q))
df = pd.concat(df)
signals.write(df, label, overwrite=True)
portfolios = famafrench_sorts(crsp,