示例#1
0
def svmDictionaries():
    """Load the stored dictionaries and filter the neural-network ones.

    Reads the Loughran-McDonald dictionary and the benchmark,
    classification and regression NN dictionaries (in that order) from
    ``drive`` via ``fd.loadFile``.

    Returns:
        Whatever ``fSVM.filterDicts`` returns when called with the
        Loughran-McDonald dictionary, the list of the three NN
        dictionaries, and ``0.4`` as third argument (the meaning of that
        value is defined by ``fSVM.filterDicts``).
    """
    nn_files = (
        'dictionary_benchNN.pckl',
        'dictionary_classificationNN.pckl',
        'dictionary_regressionNN.pckl',
    )
    reference = fd.loadFile(drive+'Loughran_McDonald_dict.pckl')
    candidates = [fd.loadFile(drive+name) for name in nn_files]
    return fSVM.filterDicts(reference, candidates, 0.4)
示例#2
0
def getScores(filename, dictionaries, dict_names):
    """Build per-dictionary feature rows from one pickled dataset.

    Args:
        filename: Path handed to ``fd.loadFile``; the result is iterated
            as a sequence of items that are indexed positionally.
        dictionaries: Sequence of dictionaries, paired by position with
            ``dict_names``.
        dict_names: Names used as keys of the returned mapping. The names
            ``'Classification'`` and ``'Regression'`` are scored with
            ``getOmega2``; every other name with ``getOmega``.

    Returns:
        dict mapping each name in ``dict_names`` to a list of 1-D numpy
        arrays laid out as ``[item[0], item[1], *x1, *x2, *y]``.

    Raises:
        IndexError: If ``dictionaries`` is longer than ``dict_names``.
    """
    dataset = fd.loadFile(filename)
    data = {name: [] for name in dict_names}
    for item in dataset:
        # np.vstack replaces np.row_stack, which is a deprecated alias of it
        # since NumPy 2.0; the result is identical.
        x1 = np.concatenate(np.vstack(item[7])[:, 0:2]).tolist()
        # Left-pad with zeros so x1 has at least 6 entries (a non-positive
        # pad count leaves x1 unchanged, as the original loop did).
        x1 = [0] * (6 - len(x1)) + x1

        base = item[7][0][0]
        if base == 0:
            # The relative change below would divide by zero; the item
            # contributes no rows at all.
            continue

        # Relative change of item[6][0] with respect to item[7][0][0].
        vol_change = (item[6][0] - base) / base
        # Last entry is a binary label: 1 for a non-negative change, else 0.
        y = [item[4][0], item[5], vol_change, 1 if vol_change >= 0 else 0]
        prefix = [item[0], item[1]] + x1

        # Indexing (not zip) keeps the original failure mode when the two
        # sequences differ in length.
        for i, d in enumerate(dictionaries):
            d_name = dict_names[i]
            if d_name in ('Classification', 'Regression'):
                x2 = getOmega2(item[-1], d)
            else:
                x2 = getOmega(item[-1], d)
            # assumes getOmega/getOmega2 return lists so that `+` below
            # concatenates -- TODO confirm
            if len(x2) > 0:
                data[d_name].append(np.array(prefix + x2 + y))
    return data
示例#3
0
def getDescriptives():
    """Count filings and words per filing type and sign for 2000-2018.

    For every year the pickled dataset ``loc + "<year>10X_final.pckl"`` is
    loaded. Each item is classified by ``f10X.getFileType`` applied to the
    text of the file at ``item[2]`` and by whether ``item[4] >= 0``.
    Items whose type is neither ``"10-K"`` nor ``"10-Q"`` are ignored.
    The word count of an item is ``f10X.wordCount(item[5])``.

    Returns:
        pandas.DataFrame with a single column and eight labelled rows:
        the four filing counts followed by the four word totals, each in
        the order 10-K positive, 10-K negative, 10-Q positive,
        10-Q negative.
    """
    # (filing type, item[4] >= 0) groups, in the order of the output rows.
    groups = [("10-K", True), ("10-K", False), ("10-Q", True), ("10-Q", False)]
    n_filings = dict.fromkeys(groups, 0)
    n_words = dict.fromkeys(groups, 0)

    for year in range(2000, 2019):
        print(year)
        filename = loc+str(year)+"10X_final.pckl"
        dataset = fd.loadFile(filename)
        for item in dataset:
            nWords = f10X.wordCount(item[5])
            # Close each filing right after reading it; the previous
            # open(...).read() left one handle per filing to the garbage
            # collector.
            with open(item[2], "r") as handle:
                text = handle.read()
            f_type = f10X.getFileType(text)
            if f_type not in ("10-K", "10-Q"):
                continue
            group = (f_type, bool(item[4] >= 0))
            n_filings[group] += 1
            n_words[group] += nWords

    descriptives = [n_filings[g] for g in groups] + [n_words[g] for g in groups]
    row_names = ['# of positive 10Ks','# of negative 10Ks','# of positive 10Qs','# of negative 10Qs','# of words in positive 10Ks','# of words in negative 10Ks','# of words in positive 10Qs','# of words in negative 10Qs']
    return pd.DataFrame(descriptives, index = row_names)
示例#4
0
def returnDictionary(dictionary, filename):
    """Update the word dictionary with the documents of one pickled file.

    Each element of the list loaded from ``filename`` is treated as one
    document: ``k[1]`` is collected into the returned CIK list and
    ``cleanText(k[-1])`` is counted with ``collections.Counter``.

    For a word not yet in ``dictionary`` a new entry is created with
    random ``'pos'``/``'neg'`` values in [0.10, 0.50], zeroed
    ``'mp'``/``'vp'``/``'mn'``/``'vn'``, ``'freq'`` set to the count in
    this document and ``'ndocs'`` set to 1. For a known word ``'freq'``
    grows by the count and ``'ndocs'`` by 1.

    Args:
        dictionary: Mapping word -> per-word entry; mutated in place.
        filename: Path handed to ``fd.loadFile``.

    Returns:
        Tuple ``(dictionary, CIKs)`` where ``CIKs`` holds ``k[1]`` of every
        document in file order.
    """
    yearly_list = fd.loadFile(filename)
    CIKs = []
    for k in yearly_list:
        CIKs.append(k[1])
        tokens = cleanText(k[-1])
        for key, value in collections.Counter(tokens).items():
            if key in dictionary:
                entry = dictionary[key]
                entry['freq'] += value
                entry['ndocs'] += 1
            else:
                # Create the entry explicitly: assigning through
                # dictionary[key][...] raised KeyError for a plain dict and
                # only worked when a defaultdict was passed in.
                dictionary[key] = {
                    'pos': rd.randint(10, 50)/100,
                    'neg': rd.randint(10, 50)/100,
                    'mp': 0,
                    'vp': 0,
                    'mn': 0,
                    'vn': 0,
                    'freq': value,
                    'ndocs': 1,
                }
    return dictionary, CIKs
##############################################################################
##############################################################################
示例#5
0
import numpy as np
import re
import pandas as pd
import functions10X as f10X
import functionsData as fd
import functionsNN as fNN
import functionsSVM as fSVM
import random as rd
import collections
import time
# Folder that holds every pickled input used below.
drive = "/Volumes/LaCie/Data/"
search = []
# Table loaded from length.pckl; the loop below filters it on columns 0 and 2
# and sums column 1.
wc = fd.loadFile(drive + 'length.pckl')
# Filing counters, split by filing type (10-K / 10-Q) and sign.
pos10K = neg10K = pos10Q = neg10Q = 0
# Matching word-count accumulators.
wcPos10K = wcNeg10K = wcPos10Q = wcNeg10Q = 0
for year in range(2000, 2015):
    print(year)
    f1 = drive + str(year) + "10X_final.pckl"
    dataset = fd.loadFile(f1)
    for item in dataset:
        wc_cik = wc[(wc[:, 2] == item[1])]
        wc_i = wc_cik[(wc_cik[:, 0] == item[0])]
        count = sum(wc_i[:, 1].astype(int))
        if item[5] >= 0:
            if '10-K' in item[2]:
示例#6
0
            #N = (batch_mat.sum(1)).mean()
            #batch_mat1 = batch_mat/N
            batch_dictDF = pd.DataFrame(batch_dict)
            m = [batch_dictDF.loc['mp'], batch_dictDF.loc['mn'], m_coef]
            v = [batch_dictDF.loc['vp'], batch_dictDF.loc['vn'], v_coef]
            y, y_hat, X = forwardPropagation(batch, batch_dict, batch_mat1, W)
            loss.append(fNN.crossEntropyLoss(y, y_hat))
            batch_dictDF, W, m_coef, v_coef = backPropagation(
                batch_dictDF, batch_mat, y, y_hat, W, X, m, v, N)
            d = batch_dictDF.to_dict()
            dictionary.update(d)
            end2 = time.time()
    return loss, W


# Driver script: trains on the 2013 and 2014 datasets and writes the
# dictionary back to dictionary_benchNN.pckl.
#
# Disabled alternative start: load dictionary_final.pckl and pass it through
# fNN.initializeX instead of resuming from dictionary_benchNN.pckl.
#dictionary = fd.loadFile(drive+'dictionary_final.pckl')
#dictionary = fNN.initializeX(dictionary)
dictionary = fd.loadFile(drive + 'dictionary_benchNN.pckl')

W, m_coef, v_coef = initializeCoefficients()
# batch_size and epochs are not passed to runNeuralNetwork below; presumably
# it reads them as module-level globals -- verify against its definition.
batch_size, epochs = setHyperparameters()
loss = []
for year in range(2013, 2015):
    start = time.time()
    dataset = fd.loadFile(drive + str(year) + '10X_final.pckl')
    # Shuffle in place so the documents are visited in random order.
    rd.shuffle(dataset)
    # NOTE(review): only (loss, W) come back, so m_coef and v_coef passed for
    # the next year are the ones created above unless runNeuralNetwork
    # mutates them in place -- confirm this is intended.
    loss, W = runNeuralNetwork(dataset, W, m_coef, v_coef)
    end = time.time()
    # Wall-clock seconds spent on this year.
    print(end - start)
# Overwrites the file the dictionary was loaded from.
fd.saveFile(dictionary, drive + 'dictionary_benchNN.pckl')
示例#7
0
import time
from sklearn import svm
from sklearn import metrics

# Folder prefix for every pickled file read by this script.
drive = '/Volumes/LaCie/Data/'
def svmDictionaries():
    """Return the NN dictionaries after filtering with ``fSVM.filterDicts``.

    The Loughran-McDonald dictionary is loaded first, followed by the
    benchmark, classification and regression NN dictionaries, all from
    ``drive``. The three NN dictionaries are passed as a list, together
    with the Loughran-McDonald dictionary and the value ``0.4``, to
    ``fSVM.filterDicts``, whose result is returned unchanged.
    """
    def _load(name):
        # Every dictionary lives directly under `drive`.
        return fd.loadFile(drive+name)

    loughran = _load('Loughran_McDonald_dict.pckl')
    nn_dicts = [
        _load('dictionary_benchNN.pckl'),
        _load('dictionary_classificationNN.pckl'),
        _load('dictionary_regressionNN.pckl'),
    ]
    return fSVM.filterDicts(loughran, nn_dicts, 0.4)

# Dictionaries are read from SVM_dictionaries.pckl rather than rebuilt with
# svmDictionaries().
dictionaries = fd.loadFile(drive+'SVM_dictionaries.pckl')
# Keys used for the per-dictionary train/test containers below.
# presumably in the same order as `dictionaries` -- verify against the pickle
dict_names = ['Loughran', 'Benchmark', 'Classification', 'Regression']

def SVMDataset(dictionaries, dict_names):
    train, test = dict(),dict()
    for d in dict_names:
        train[d] = []
        test[d] = []
    for year in range(2000,2015):
        print(year)
        filename = drive+str(year)+'10X_final.pckl'
        X = fSVM.getScores(filename, dictionaries, dict_names)
        for d in dict_names:
            train[d].extend(X[d])
    for year in range(2015,2019):
        print(year)
示例#8
0
            N = 0
            #N = (batch_mat.sum(1)).mean()
            #batch_mat1 = batch_mat/N
            batch_mat1 = fNN.tfidf2(batch_mat)
            batch_dictDF = pd.DataFrame(batch_dict)
            m = [batch_dictDF.loc['mp'],batch_dictDF.loc['mn'], Ms]
            v = [batch_dictDF.loc['vp'],batch_dictDF.loc['vn'], Vs]
            y, y_hat, X = forwardPropagation(batch, batch_dict, batch_mat1, coefficients)
            loss.append(fNN.MSELoss(y, y_hat))
            batch_dictDF, coefficients, Ms, Vs = backPropagation(batch_dictDF, batch_mat, y, y_hat, coefficients, X, m, v, N)
            d = batch_dictDF.to_dict()
            dictionary.update(d)
            end2 = time.time()
    return loss, coefficients
# Driver script: trains on the 2013 and 2014 datasets and writes the
# dictionary to dictionary_regressionNN.pckl.
#
# NOTE(review): blocks for 15000 s (about 4 h 10 min) before anything else
# runs; presumably a manual delay to wait for another job -- confirm it is
# still wanted.
time.sleep(15000)
dictionary = fd.loadFile(drive+'dictionary_filtered.pckl')
dictionary = fNN.initializeX(dictionary)
# Disabled alternative start: resume from the previously saved
# dictionary_regressionNN.pckl.
#dictionary = fd.loadFile(drive+'dictionary_regressionNN.pckl')
# Not used in the visible statements; presumably read as a global by
# functions defined elsewhere -- verify.
n_docs = 276880

coefficients, Ms, Vs = initializeCoefficients()
# batch_size and epochs are not passed to runNeuralNetwork below; presumably
# it reads them as module-level globals -- verify against its definition.
batch_size, epochs = setHyperparameters()
loss = []
for year in range(2013,2015):
    start = time.time()
    dataset = fd.loadFile(drive+str(year)+'10X_final.pckl')
    # Shuffle in place so the documents are visited in random order.
    rd.shuffle(dataset)
    # NOTE(review): only (loss, coefficients) come back, so Ms and Vs passed
    # for the next year are the ones created above unless runNeuralNetwork
    # mutates them in place -- confirm this is intended.
    loss, coefficients = runNeuralNetwork(dataset, coefficients, Ms, Vs)
    end = time.time()
    # Wall-clock seconds spent on this year.
    print(end-start)
fd.saveFile(dictionary, drive+'dictionary_regressionNN.pckl')
示例#9
0
            m = [batch_dictDF.loc['mp'], batch_dictDF.loc['mn'], Ms]
            v = [batch_dictDF.loc['vp'], batch_dictDF.loc['vn'], Vs]
            y, y_hat, X = forwardPropagation(batch, batch_dict, batch_mat1,
                                             coefficients)
            loss.append(fNN.crossEntropyLoss(y, y_hat))
            batch_dictDF, coefficients, Ms, Vs = backPropagation(
                batch_dictDF, batch_mat, y, y_hat, coefficients, X, m, v, N)
            d = batch_dictDF.to_dict()
            dictionary.update(d)
            end2 = time.time()
    return loss, coefficients


# Driver script: trains on the 2007-2014 datasets, then saves the
# dictionary, the coefficients and Ms.
#
# Disabled alternative start: load dictionary_filtered.pckl and pass it
# through fNN.initializeX instead of resuming from
# dictionary_classificationNN.pckl.
#dictionary = fd.loadFile(drive+'dictionary_filtered.pckl')
#dictionary = fNN.initializeX(dictionary)
dictionary = fd.loadFile(drive + 'dictionary_classificationNN.pckl')
# Not used in the visible statements; presumably read as a global by
# functions defined elsewhere -- verify.
n_docs = 276880

coefficients, Ms, Vs = initializeCoefficients()
# batch_size and epochs are not passed to runNeuralNetwork below; presumably
# it reads them as module-level globals -- verify against its definition.
batch_size, epochs = setHyperparameters()
loss = []
for year in range(2007, 2015):
    start = time.time()
    dataset = fd.loadFile(drive + str(year) + '10X_final.pckl')
    # Shuffle in place so the documents are visited in random order.
    rd.shuffle(dataset)
    # NOTE(review): only (loss, coefficients) come back, so Ms and Vs passed
    # for the next year are the ones created above unless runNeuralNetwork
    # mutates them in place -- confirm this is intended.
    loss, coefficients = runNeuralNetwork(dataset, coefficients, Ms, Vs)
    end = time.time()
    # Wall-clock seconds spent on this year.
    print(end - start)
# Overwrites the file the dictionary was loaded from.
fd.saveFile(dictionary, drive + 'dictionary_classificationNN.pckl')
fd.saveFile(coefficients, drive + 'coefficients_classificationNN.pckl')
# NOTE(review): Ms is saved but no matching save of Vs is visible here.
fd.saveFile(Ms, drive + 'Ms_classificationNN.pckl')
示例#10
0
import functionsData as fd
import pandas as pd
# Export script: writes one worksheet per key of `desc` to
# descriptivesCIK.xlsx, each holding three titled tables stacked vertically
# ('General Descriptives', 'Quantiles', 'Periods').

# CIKs handed to fd.ciksDescriptives.
descr_ciks = ['0000072971', '0001403161', '0000875320', '0001318605',
              '0000078003', '0001021860', '0000879101', '0000019617',
              '0000886982', '0000037996', '0000034088', '0000712515',
              '0000732717', '0000320193', '0000789019', '0000106640',
              '0001418091', '0001283699', '0000092380', '0001039684']
drive = '/Volumes/LaCie/Data/'
fullCIKs = fd.loadFile(drive+'CIKs_final.pckl')
desc = fd.ciksDescriptives(descr_ciks)

name_xlsx = drive+'descriptivesCIK.xlsx'
# The context manager closes (and thereby writes) the workbook on exit.
# ExcelWriter.save() was deprecated in pandas 1.5 and removed in 2.0.
with pd.ExcelWriter(name_xlsx,engine='xlsxwriter') as writer:
    workbook=writer.book
    for key in desc:
        worksheet=workbook.add_worksheet(key)
        # Register the sheet so to_excel writes into it instead of creating
        # a new one (needed by older pandas; harmless on newer versions).
        writer.sheets[key] = worksheet

        n_desc_rows = desc[key]['Descriptives'].shape[0]
        n_quant_rows = desc[key]['Quantiles'].shape[0]
        # First row of each table; its title goes on the row just above.
        quantiles_row = n_desc_rows + 5
        periods_row = quantiles_row + n_quant_rows + 5

        worksheet.write_string(0, 0, 'General Descriptives')
        desc[key]['Descriptives'].to_excel(writer,sheet_name=key,startrow=1 , startcol=0)

        worksheet.write_string(quantiles_row - 1, 0, 'Quantiles')
        desc[key]['Quantiles'].to_excel(writer,sheet_name=key,startrow=quantiles_row, startcol=0)

        worksheet.write_string(periods_row - 1, 0, 'Periods')
        desc[key]['Periods'].to_excel(writer,sheet_name=key,startrow=periods_row, startcol=0)