示例#1
0
def famaMacBethRegression12(dates, firms, mdatabase, pdatabase1, pdatabase2,
                            t1, t2):
    """
    Average coefficients for daily cross-sectional regressions over dates given.

    For every day t in dates one OLS regression is fitted across firms:
        Y ~ B1 + B2 + B3 + S1 + S2 + G1 + ... + G9
    where
        Y      = u.abnormalReturn over [t+t1, t+t2]
        B1     = u.abnormalPercentageOld on t
        B3     = u.abnormalReturnDate on t+1
        B2     = B1 * B3
        S1     = u.abnormalPercentageRecombinations on t
        S2     = S1 * B3
        G1..G9 = entries 0..8 of u.generateXList on t (Stories, AbnStories,
                 Terms, MCap, BM, AbnRet, AbnVol, AbnVolatility, Illiq)
    A firm is left out of day t when any helper returns -1 or generateXList
    returns an empty value. A day on which no firm qualifies is skipped.

    dates: list of day strings in order from starting date to ending date,
           each date represents a date t used for computation. The list is
           NOT modified; the t2 look-ahead days are added to a private copy.
    firms: list of tickers of interest
    mdatabase: database of news measures
    pdatabase1: crsp data frame wrapper exposing .dates and .recordDates
    pdatabase2: compustat data frame
    t1, t2: used for AbnRet date range [t+t1, t+t2] where t is current reference day
    Returns tuple (dictionary mapping coefficients to values, dictionary
    mapping coefficients to their standard errors). The values are the mean of
    the daily coefficients; the errors are sqrt(sum of squared daily standard
    errors) / number of days used. When no day could be used, every value in
    both dictionaries is NaN.
    """
    # Build the trading-day index on first use.
    if not pdatabase1.dates:
        pdatabase1.recordDates("date", False)  # "date" is a col name in crsp
    num_days = len(dates)
    # Pad a copy with the t2 days following the last date so that the final
    # dependent-variable queries stay in range. Copying keeps the caller's
    # list intact, so the same list can be reused for another call.
    extra_day_start_index = pdatabase1.dates.index(int(dates[-1])) + 1
    padded = list(dates)
    padded.extend(
        str(pdatabase1.dates[extra_day_start_index + k]) for k in range(t2))

    # (result key, regression term) pairs in reporting order.
    terms = [('a', 'Intercept'), ('b1', 'B1'), ('b2', 'B2'), ('b3', 'B3'),
             ('s1', 'S1'), ('s2', 'S2')]
    terms += [('g%d' % k, 'G%d' % k) for k in range(1, 10)]
    regressors = [term for _, term in terms[1:]]
    column_names = ["Y"] + regressors
    formula = "Y ~ " + " + ".join(regressors)

    # running sums as cross-sectional regressions are computed
    coefficients = {key: 0 for key, _ in terms}
    squared_errors = {key + 'se': 0 for key, _ in terms}
    # for average at end
    unused_dates = []
    for i in range(num_days):
        day = padded[i]
        print("DAY T: " + day)
        # compute daily cross-sectional regression
        columns = {name: [] for name in column_names}
        for firm in firms:
            # skip firms where no data is available on date
            dependent_var = u.abnormalReturn(firm, padded[i + t1],
                                             padded[i + t2], pdatabase1, False)
            if dependent_var == -1:
                continue
            abn_pcr_old = u.abnormalPercentageOld(firm, day, mdatabase)
            if abn_pcr_old == -1:
                continue
            abn_ret_next = u.abnormalReturnDate(firm, padded[i + 1],
                                                pdatabase1, False)
            if abn_ret_next == -1:
                continue
            abn_pcr_rec = u.abnormalPercentageRecombinations(
                firm, day, mdatabase)
            if abn_pcr_rec == -1:
                continue
            x = u.generateXList(firm, day, mdatabase, pdatabase1, pdatabase2,
                                False)
            if not x:
                continue
            row = [
                dependent_var,
                abn_pcr_old,
                abn_pcr_old * abn_ret_next,
                abn_ret_next,
                abn_pcr_rec,
                abn_pcr_rec * abn_ret_next,
            ]
            row.extend(x[k] for k in range(9))
            for name, value in zip(column_names, row):
                columns[name].append(value)
        # Invalid date
        if not columns["Y"]:
            unused_dates.append(day)
            continue
        # Create pandas data frame and run regression with statsmodels
        df = pd.DataFrame(columns)
        # 'HAC' requests heteroscedasticity- and autocorrelation-robust
        # (Newey-West) standard errors with one lag
        result = sm.ols(formula=formula,
                        data=df).fit(cov_type='HAC', cov_kwds={'maxlags': 1})
        for key, term in terms:
            coefficients[key] += result.params[term]
            squared_errors[key + 'se'] += result.bse[term]**2
    print(unused_dates)
    num_dates_used = num_days - len(unused_dates)
    print(num_dates_used)
    if num_dates_used == 0:
        # No regression was run: report NaN instead of dividing by zero.
        nan = float('nan')
        return {key: nan for key in coefficients}, \
               {key: nan for key in squared_errors}
    return {key: coefficients[key] / num_dates_used for key in coefficients}, \
           {key: (squared_errors[key] / (num_dates_used**2))**0.5
            for key in squared_errors}
示例#2
0
def famaMacBethRegression8_9(dates,
                             firms,
                             mdatabase,
                             pdatabase1,
                             pdatabase2,
                             eight=True):
    """
    Average coefficients for daily cross-sectional regressions over dates given.

    For every day t in dates one OLS regression is fitted across firms:
        Y ~ B + G1 + ... + G9
    where
        Y      = abs(u.abnormalReturnDate on t+1) when eight is True,
                 u.abnormalVolDate on t+1 otherwise
        B      = u.abnormalPercentageOld on t
        G1..G9 = entries 0..8 of u.generateXList on t (Stories, AbnStories,
                 Terms, MCap, BM, AbnRet, AbnVol, AbnVolatility, Illiq)
    A firm is left out of day t when any helper returns -1 or generateXList
    returns an empty value. A day on which no firm qualifies is skipped.

    dates: list of day strings in order from starting date to ending date,
           each date represents a date t used for computation. The list is
           NOT modified; the extra look-ahead day is added to a private copy.
    firms: list of tickers of interest
    mdatabase: database of news measures
    pdatabase1: crsp data frame wrapper exposing .dates and .recordDates
    pdatabase2: compustat data frame
    eight: True computes equation 8, False computes equation 9
    Returns tuple (dictionary mapping coefficients to values, dictionary
    mapping coefficients to their standard errors). The values are the mean of
    the daily coefficients; the errors are sqrt(sum of squared daily standard
    errors) / number of days used. When no day could be used, every value in
    both dictionaries is NaN.
    """
    # Build the trading-day index on first use.
    if not pdatabase1.dates:
        pdatabase1.recordDates("date", False)  # "date" is a col name in crsp
    num_days = len(dates)
    # Pad a copy with the day after the last date for the very last t+1
    # query. Copying keeps the caller's list intact for later reuse.
    extra_day_index = pdatabase1.dates.index(int(dates[-1])) + 1
    padded = list(dates)
    padded.append(str(pdatabase1.dates[extra_day_index]))

    # (result key, regression term) pairs in reporting order.
    terms = [('a', 'Intercept'), ('b', 'B')]
    terms += [('g%d' % k, 'G%d' % k) for k in range(1, 10)]
    regressors = [term for _, term in terms[1:]]
    column_names = ["Y"] + regressors
    formula = "Y ~ " + " + ".join(regressors)

    # running sums as cross-sectional regressions are computed
    coefficients = {key: 0 for key, _ in terms}
    squared_errors = {key + 'se': 0 for key, _ in terms}
    # for average at end
    unused_dates = []
    for i in range(num_days):
        day = padded[i]
        next_day = padded[i + 1]
        print("DAY T: " + day)
        # compute daily cross-sectional regression
        columns = {name: [] for name in column_names}
        for firm in firms:
            # skip firms where no data is available on date
            if eight:
                dependent_var = u.abnormalReturnDate(firm, next_day,
                                                     pdatabase1, False)
            else:
                dependent_var = u.abnormalVolDate(firm, next_day, pdatabase1,
                                                  False)
            # the -1 sentinel is tested before abs() is applied below
            if dependent_var == -1:
                continue
            abn_pct_old = u.abnormalPercentageOld(firm, day, mdatabase)
            if abn_pct_old == -1:
                continue
            x = u.generateXList(firm, day, mdatabase, pdatabase1, pdatabase2,
                                False)
            if not x:
                continue
            row = [abs(dependent_var) if eight else dependent_var, abn_pct_old]
            row.extend(x[k] for k in range(9))
            for name, value in zip(column_names, row):
                columns[name].append(value)
        # Invalid date
        if not columns["Y"]:
            unused_dates.append(day)
            continue
        # Create pandas data frame and run regression with statsmodels
        df = pd.DataFrame(columns)
        # 'HAC' requests heteroscedasticity- and autocorrelation-robust
        # (Newey-West) standard errors with one lag
        result = sm.ols(formula=formula,
                        data=df).fit(cov_type='HAC', cov_kwds={'maxlags': 1})
        for key, term in terms:
            coefficients[key] += result.params[term]
            squared_errors[key + 'se'] += result.bse[term]**2
    print(unused_dates)
    num_dates_used = num_days - len(unused_dates)
    print(num_dates_used)
    if num_dates_used == 0:
        # No regression was run: report NaN instead of dividing by zero.
        nan = float('nan')
        return {key: nan for key in coefficients}, \
               {key: nan for key in squared_errors}
    return {key: coefficients[key] / num_dates_used for key in coefficients}, \
           {key: (squared_errors[key] / (num_dates_used**2))**0.5
            for key in squared_errors}
示例#3
0
def generate_csv12(dates, firms, mdatabase, pdatabase1, pdatabase2, t1, t2):
    """
    Writes csv file for computation over dates given.

    One row is written per (date t, firm) pair for which every input is
    available: a pair is skipped when any u.* helper returns -1 or
    u.generateXList returns an empty value. The output file is
    fm_data_12_<first date>_<last date>_<t1>_<t2>.csv in the current
    working directory.

    dates: list of day strings in order from starting date to ending date,
           each date represents a date t used for computation. The list is
           NOT modified; the t2 look-ahead days are added to a private copy.
    firms: list of tickers of interest
    mdatabase: database of news measures
    pdatabase1: crsp data frame wrapper exposing .dates and .recordDates
    pdatabase2: compustat data frame
    t1, t2: used for AbnRet date range [t+t1, t+t2] where t is current reference day
    """
    # Build the trading-day index on first use.
    if not pdatabase1.dates:
        pdatabase1.recordDates("date", False)  # "date" is a col name in crsp
    num_days = len(dates)
    # Pad a copy with the t2 days following the last date so that the final
    # dependent-variable queries stay in range. Copying keeps the caller's
    # list intact, so the same list can be reused for another call.
    extra_day_start_index = pdatabase1.dates.index(int(dates[-1])) + 1
    padded = list(dates)
    padded.extend(
        str(pdatabase1.dates[extra_day_start_index + k]) for k in range(t2))

    # Output columns, in the order they appear in the csv.
    column_names = [
        'date', 'dependent', 'AbnPcrOld', 'AbnPcrOldXAbnRet', 'AbnRet',
        'AbnPcrRecombinations', 'AbnPcrRecombinationsXAbnRet', 'Stories',
        'AbnStories', 'Terms', 'MCap', 'BM', 'AbnRetVect', 'AbnVol',
        'AbnVolitility', 'Illiq'
    ]
    # store data
    columns = {name: [] for name in column_names}
    for i in range(num_days):
        day = padded[i]
        print("DAY T: " + day)
        for firm in firms:
            # skip firms where no data is available on date
            dependent_var = u.abnormalReturn(firm, padded[i + t1],
                                             padded[i + t2], pdatabase1, False)
            if dependent_var == -1:
                continue
            abn_pcr_old = u.abnormalPercentageOld(firm, day, mdatabase)
            if abn_pcr_old == -1:
                continue
            abn_ret_next = u.abnormalReturnDate(firm, padded[i + 1],
                                                pdatabase1, False)
            if abn_ret_next == -1:
                continue
            abn_pcr_rec = u.abnormalPercentageRecombinations(
                firm, day, mdatabase)
            if abn_pcr_rec == -1:
                continue
            x = u.generateXList(firm, day, mdatabase, pdatabase1, pdatabase2,
                                False)
            if not x:
                continue
            row = [
                day,
                dependent_var,
                abn_pcr_old,
                abn_pcr_old * abn_ret_next,
                abn_ret_next,
                abn_pcr_rec,
                abn_pcr_rec * abn_ret_next,
            ]
            row.extend(x[k] for k in range(9))
            for name, value in zip(column_names, row):
                columns[name].append(value)
    # Create pandas data frame to write out
    df = pd.DataFrame(columns)
    print("ENTRIES: " + str(len(columns['date'])))
    print("COLUMNS: " + str(len(column_names)))
    # output fm_data_12_start_end_t1_t2.csv
    # (the prefix used to be 'fm_data_10_', which contradicted this function's
    # name and the intended file name stated above)
    df.to_csv('fm_data_12_' + str(dates[0]) + "_" + str(dates[-1]) + "_" +
              str(t1) + "_" + str(t2) + ".csv",
              index=False)
示例#4
0
def generate_csv8_9(dates,
                    firms,
                    mdatabase,
                    pdatabase1,
                    pdatabase2,
                    eight=True):
    """
    Writes csv file for computation over dates given.

    One row is written per (date t, firm) pair for which every input is
    available: a pair is skipped when any u.* helper returns -1 or
    u.generateXList returns an empty value. The dependent column holds
    abs(u.abnormalReturnDate on t+1) when eight is True and
    u.abnormalVolDate on t+1 otherwise. The output file is
    fm_data_8_<first date>_<last date>.csv (eight=True) or
    fm_data_9_<first date>_<last date>.csv (eight=False) in the current
    working directory.

    dates: list of day strings in order from starting date to ending date,
           each date represents a date t used for computation. The list is
           NOT modified; the extra look-ahead day is added to a private copy.
    firms: list of tickers of interest
    mdatabase: database of news measures
    pdatabase1: crsp data frame wrapper exposing .dates and .recordDates
    pdatabase2: compustat data frame
    eight: True computes equation 8, False computes equation 9
    """
    # Build the trading-day index on first use.
    if not pdatabase1.dates:
        pdatabase1.recordDates("date", False)  # "date" is a col name in crsp
    num_days = len(dates)
    # Pad a copy with the day after the last date for the very last t+1
    # query. Copying keeps the caller's list intact for later reuse.
    extra_day_index = pdatabase1.dates.index(int(dates[-1])) + 1
    padded = list(dates)
    padded.append(str(pdatabase1.dates[extra_day_index]))

    # Output columns, in the order they appear in the csv.
    column_names = [
        'date', 'dependent', 'AbnPctOld', 'Stories', 'AbnStories', 'Terms',
        'MCap', 'BM', 'AbnRet', 'AbnVol', 'AbnVolitility', 'Illiq'
    ]
    # store data
    columns = {name: [] for name in column_names}
    for i in range(num_days):
        day = padded[i]
        next_day = padded[i + 1]
        print("DAY T: " + day)
        for firm in firms:
            # skip firms where no data is available on date
            if eight:
                dependent_var = u.abnormalReturnDate(firm, next_day,
                                                     pdatabase1, False)
            else:
                dependent_var = u.abnormalVolDate(firm, next_day, pdatabase1,
                                                  False)
            # the -1 sentinel is tested before abs() is applied below
            if dependent_var == -1:
                continue
            abn_pct_old = u.abnormalPercentageOld(firm, day, mdatabase)
            if abn_pct_old == -1:
                continue
            x = u.generateXList(firm, day, mdatabase, pdatabase1, pdatabase2,
                                False)
            if not x:
                continue
            row = [
                day,
                abs(dependent_var) if eight else dependent_var,
                abn_pct_old,
            ]
            row.extend(x[k] for k in range(9))
            for name, value in zip(column_names, row):
                columns[name].append(value)
    # Create pandas data frame to write out
    df = pd.DataFrame(columns)
    print("ENTRIES: " + str(len(columns['date'])))
    print("COLUMNS: " + str(len(column_names)))
    # output fm_data_8_start_end.csv or fm_data_9_start_end.csv
    prefix = 'fm_data_8_' if eight else 'fm_data_9_'
    df.to_csv(prefix + str(dates[0]) + "_" + str(dates[-1]) + ".csv",
              index=False)
示例#5
0
import pandas as pd

# Export one row of news measures per key of mdb.tdMap to a csv file, then
# rewrite that file sorted by date and ticker.
mdb = d.AdjustableMeasuresDatabase("reduced_djn_data.csv")
new_name = "news_measures.csv"

header = "DATE,TICKER,STORIES,TERMS,ABN_PCT_OLD,ABN_PCT_REC\n"

# Create file and write header. The context manager closes (and flushes) the
# file even if one of the measure computations raises, and guarantees it is
# closed before pandas reads it back below.
with open(new_name, "w+") as f:
    f.write(header)
    # Rows are written in mdb.tdMap iteration order; each key is indexed as
    # (ticker, date), matching the argument order of the u.* helpers.
    for tup in mdb.tdMap:
        ticker, date = tup[0], tup[1]
        fields = [
            date,
            ticker,
            str(u.stories(ticker, date, mdb)),
            str(u.terms(ticker, date, mdb)),
            str(u.abnormalPercentageOld(ticker, date, mdb)),
            str(u.abnormalPercentageRecombinations(ticker, date, mdb)),
        ]
        f.write(",".join(fields) + "\n")

# Sort by date and ticker
df = pd.read_csv(new_name)
df.sort_values(by=["DATE", "TICKER"], inplace=True)
df.to_csv(new_name, index=False)