'realestate', 'salecash', 'salerec', 'saleinv', 'secured',
               'sgr', 'sp', 'tang', 'bm_ia', 'cfp_ia', 'chatoia' , 'chpmia',
               'pchcapx_ia', 'chempia', 'mve_ia']
    numlag = 6       # number of months to lag data for rebalance
    end = LAST_DATE   # last data date

    if regenerate:
        # retrieve annual, keep [permno, datadate] with non null prccq if any
        fields = ['sic', 'fyear', 'ib', 'oancf', 'at', 'act', 'che', 'lct',
                  'dlc', 'dltt', 'prcc_f', 'csho', 'invt', 'dp', 'ppent',
                  'dvt', 'ceq', 'txp', 'revt', 'cogs', 'rect', 'aco', 'intan',
                  'ao', 'ap', 'lco', 'lo', 'capx', 'emp', 'ppegt', 'lt',
                  'sale', 'xsga', 'xrd', 'fatb', 'fatl', 'dm']
        df = pstat.get_linked(
            dataset='annual', date_field='datadate', fields=fields,
            where=(f"indfmt = 'INDL' AND datafmt = 'STD' AND "
                   f"curcd = 'USD' AND popsrc = 'D' AND consol = 'C' AND "
                   f"datadate <= {end//100}31"))
        fund = df.sort_values(['permno', 'datadate', 'ib'])\
                 .drop_duplicates(['permno', 'datadate'])\
                 .dropna(subset=['ib'])
        fund.index = list(zip(fund['permno'], fund['datadate']))  # multi-index
        fund['rebaldate'] = bd.endmo(fund.datadate, numlag)

        # precompute, and lag common metrics: mve_f avg_at sic2
        fund['sic2'] = np.where(fund['sic'].notna(), fund['sic'] // 100, 0)
        fund['fyear'] = fund['datadate'] // 10000      # can delete this
        fund['mve_f'] = fund['prcc_f'] * fund['csho']
    
        lag = fund.shift(1, fill_value=0)
        lag.loc[lag['permno'] != fund['permno'], fields] = np.nan
# Instantiate the CRSP and PSTAT (presumably Compustat) interface objects
crsp = CRSP(sql, bd, rdb)
pstat = PSTAT(sql, bd)

## Set up the starting date and rebalance dates (each passed through
## bd.offset), then build the stock data and the daily performance object
rebalbeg = bd.offset(20190630)
rebals = list(map(bd.offset, [20200630]))
stocks = chunk_stocks(crsp, rebalbeg, LAST_DATE)
perf = DailyPerformance(stocks)

# HML factor: start by pulling the annual fields needed for book equity
label = 'hml'
lag = 6  # fundamental data are lagged by this many months
df = pstat.get_linked(
    dataset='annual',
    date_field='datadate',
    fields=['seq', 'pstk', 'pstkrv', 'pstkl', 'txditc'],
    # only rows with positive seq dated on or after 2014-12-01 are requested
    where=(
        "indfmt = 'INDL' AND datafmt = 'STD' AND curcd = 'USD' "
        "  AND popsrc = 'D' AND consol = 'C' "
        "  AND seq > 0 AND datadate >= 20141201"
    ),
)

## preferred stock: first non-missing of pstkrv, pstkl, pstk, otherwise zero
df[label] = np.where(
    df['pstkrv'].notna(),
    df['pstkrv'],
    np.where(
        df['pstkl'].notna(),
        df['pstkl'],
        np.where(df['pstk'].notna(), df['pstk'], 0),
    ),
)

## seq plus txditc (missing treated as zero) minus the preferred stock above
df[label] = df['seq'].add(df['txditc'].fillna(0)).sub(df[label])

## discard missing and non-positive results, keeping identifier columns only
df.dropna(subset=[label], inplace=True)
df = df.loc[df[label].gt(0), ['permno', 'gvkey', 'datadate', label]]

## years in Compustat: order each gvkey's rows by datadate, then number them
## from zero, so 'count' is how many earlier rows that gvkey has
df = df.sort_values(['gvkey', 'datadate'])
df['count'] = df.groupby('gvkey').cumcount()