Python loadFile示例，functionsData.loadFile Python示例

示例#1

0

显示文件

文件： SVMs.py 项目： tgarutti/Fin-SentiLex

def svmDictionaries():
    """Load the pickled dictionaries and return them after filtering.

    The dictionary stored in ``Loughran_McDonald_dict.pckl`` is handed to
    ``fSVM.filterDicts`` as its first argument; the benchmark, classification
    and regression NN dictionaries (in that order) form the list passed as
    the second argument, and ``0.4`` is the third argument.

    Returns:
        The value returned by ``fSVM.filterDicts`` for those arguments.
    """
    reference = fd.loadFile(drive+'Loughran_McDonald_dict.pckl')
    nn_files = (
        'dictionary_benchNN.pckl',
        'dictionary_classificationNN.pckl',
        'dictionary_regressionNN.pckl',
    )
    # Loaded in the same order as listed above.
    candidates = [fd.loadFile(drive+name) for name in nn_files]
    return fSVM.filterDicts(reference, candidates, 0.4)

示例#2

0

显示文件

文件： functionsSVM.py 项目： tgarutti/Fin-SentiLex

def getScores(filename, dictionaries, dict_names):
    """Build one feature/target row per (item, dictionary) pair of a dataset.

    Args:
        filename: Path handed to ``fd.loadFile``; the loaded object is
            iterated item by item.
        dictionaries: Sequence of dictionaries, positionally aligned with
            ``dict_names``.
        dict_names: Names used as keys of the returned mapping. The names
            ``'Classification'`` and ``'Regression'`` select ``getOmega2``;
            every other name selects ``getOmega``.

    Returns:
        A dict mapping every name in ``dict_names`` to a list of NumPy
        arrays. Each array is the concatenation of ``[item[0], item[1]]``,
        the zero-padded values taken from ``item[7]``, the scores returned by
        ``getOmega``/``getOmega2`` and the target list ``y``.
    """
    dataset = fd.loadFile(filename)
    data = {name: [] for name in dict_names}
    for item in dataset:
        # np.vstack is the supported spelling of the deprecated np.row_stack
        # alias; the stacked rows' first two columns are flattened to a list.
        x1 = np.concatenate(np.vstack(item[7])[:, 0:2]).tolist()
        # Left-pad with zeros up to six entries (no-op when already >= 6).
        x1 = [0]*(6-len(x1)) + x1

        base = item[7][0][0]
        if base == 0:
            # The relative change below would divide by zero; skip the item.
            continue
        vol_change = (item[6][0]-base)/base
        # Last target entry is a 0/1 flag for a non-negative change.
        y = [item[4][0], item[5], vol_change, 1 if vol_change >= 0 else 0]

        info = [item[0], item[1]]
        for i, d in enumerate(dictionaries):
            d_name = dict_names[i]
            if d_name == 'Classification' or d_name == 'Regression':
                x2 = getOmega2(item[-1], d)
            else:
                x2 = getOmega(item[-1], d)
            # Items for which the dictionary yields no scores are left out.
            if len(x2) > 0:
                data[d_name].append(np.array(info+x1+x2+y))
    return data

示例#3

0

显示文件

def getDescriptives():
    """Count filings and words per file type and sign of ``item[4]``.

    For every year from 2000 through 2018 the file
    ``loc + str(year) + "10X_final.pckl"`` is loaded with ``fd.loadFile``.
    Each item's file (path ``item[2]``) is read to determine its type via
    ``f10X.getFileType``; items typed ``"10-K"`` or ``"10-Q"`` are counted,
    split by whether ``item[4] >= 0``. Items of any other type are ignored.

    Returns:
        A single-column ``pandas.DataFrame`` with eight rows: the four
        document counts followed by the four word counts, indexed by a
        descriptive row label.
    """
    file_types = ("10-K", "10-Q")
    # Keys are (file type, is_non_negative); insertion order matches the
    # order of the rows in the returned frame.
    n_docs = {(f_type, sign): 0 for f_type in file_types for sign in (True, False)}
    n_words = dict(n_docs)

    for year in range(2000, 2019):
        print(year)
        filename = loc+str(year)+"10X_final.pckl"
        dataset = fd.loadFile(filename)
        for item in dataset:
            nWords = f10X.wordCount(item[5])
            # Close the handle deterministically instead of leaking one open
            # file per filing.
            with open(item[2], "r") as handle:
                text = handle.read()
            f_type = f10X.getFileType(text)
            if f_type not in file_types:
                continue
            bucket = (f_type, bool(item[4] >= 0))
            n_docs[bucket] += 1
            n_words[bucket] += nWords

    descriptives = list(n_docs.values()) + list(n_words.values())
    row_names = ['# of positive 10Ks','# of negative 10Ks','# of positive 10Qs','# of negative 10Qs','# of words in positive 10Ks','# of words in negative 10Ks','# of words in positive 10Qs','# of words in negative 10Qs']
    return pd.DataFrame(descriptives, index = row_names)

示例#4

0

显示文件

文件： functions10X.py 项目： tgarutti/master_thesis

def returnDictionary(dictionary, filename):
    """Merge the token counts of one loaded file into ``dictionary``.

    Args:
        dictionary: Mapping from token to a per-token dict of fields. It is
            updated in place. Both plain ``dict`` objects and mappings that
            create missing entries on access (e.g. ``defaultdict(dict)``)
            are supported.
        filename: Path handed to ``fd.loadFile``; the loaded object is
            iterated, ``k[1]`` is collected and ``k[-1]`` is passed to
            ``cleanText`` before counting.

    Returns:
        A tuple ``(dictionary, CIKs)`` where ``CIKs`` lists ``k[1]`` for
        every loaded element, in order.
    """
    yearly_list = fd.loadFile(filename)
    CIKs = []
    for k in yearly_list:
        CIKs.append(k[1])
        text = cleanText(k[-1])
        count = collections.Counter(text)
        for key, value in count.items():
            if key in dictionary:
                entry = dictionary[key]
                entry['freq'] = entry['freq'] + value
                entry['ndocs'] += 1
                continue

            # New token: let an auto-creating mapping build its own entry,
            # otherwise create one explicitly. Indexing a plain dict here
            # used to raise KeyError.
            try:
                entry = dictionary[key]
            except KeyError:
                entry = dictionary[key] = {}
            # 'pos' and 'neg' start at a random value in [0.10, 0.50].
            entry['pos'] = rd.randint(10, 50)/100
            entry['neg'] = rd.randint(10, 50)/100
            entry['mp'] = 0
            entry['vp'] = 0
            entry['mn'] = 0
            entry['vn'] = 0
            entry['freq'] = value
            entry['ndocs'] = 1
    return dictionary, CIKs
##############################################################################
##############################################################################

示例#5

0

显示文件

文件： searchFiles.py 项目： tgarutti/master_thesis

import numpy as np
import re
import pandas as pd
import functions10X as f10X
import functionsData as fd
import functionsNN as fNN
import functionsSVM as fSVM
import random as rd
import collections
import time
# Base directory that every data file below is read from.
drive = "/Volumes/LaCie/Data/"
search = []
# Indexed below as a 2-D array: column 2 is compared with item[1], column 0
# with item[0], and column 1 is summed as integers.
# NOTE(review): looks like a per-document word-count table - confirm.
wc = fd.loadFile(drive + 'length.pckl')
# Counters, all starting at zero.
# NOTE(review): presumably document and word counts per filing type and
# sign; the code that updates them is cut off in this excerpt.
pos10K = 0
neg10K = 0
pos10Q = 0
neg10Q = 0
wcPos10K = 0
wcNeg10K = 0
wcPos10Q = 0
wcNeg10Q = 0
# Walk the yearly files for 2000 through 2014.
for year in range(2000, 2015):
    print(year)
    f1 = drive + str(year) + "10X_final.pckl"
    dataset = fd.loadFile(f1)
    for item in dataset:
        # Keep the rows of wc whose column 2 equals item[1], then of those
        # the rows whose column 0 equals item[0].
        wc_cik = wc[(wc[:, 2] == item[1])]
        wc_i = wc_cik[(wc_cik[:, 0] == item[0])]
        # Sum column 1 of the matching rows after casting to int.
        count = sum(wc_i[:, 1].astype(int))
        # NOTE(review): the body of the branches below is not visible in
        # this excerpt.
        if item[5] >= 0:
            if '10-K' in item[2]:

示例#6

0

显示文件

            #N = (batch_mat.sum(1)).mean()
            #batch_mat1 = batch_mat/N
            batch_dictDF = pd.DataFrame(batch_dict)
            m = [batch_dictDF.loc['mp'], batch_dictDF.loc['mn'], m_coef]
            v = [batch_dictDF.loc['vp'], batch_dictDF.loc['vn'], v_coef]
            y, y_hat, X = forwardPropagation(batch, batch_dict, batch_mat1, W)
            loss.append(fNN.crossEntropyLoss(y, y_hat))
            batch_dictDF, W, m_coef, v_coef = backPropagation(
                batch_dictDF, batch_mat, y, y_hat, W, X, m, v, N)
            d = batch_dictDF.to_dict()
            dictionary.update(d)
            end2 = time.time()
    return loss, W


# Alternative start-up path (kept disabled): load 'dictionary_final.pckl'
# and pass it through fNN.initializeX.
#dictionary = fd.loadFile(drive+'dictionary_final.pckl')
#dictionary = fNN.initializeX(dictionary)
# Start from the dictionary stored in the same file that is written at the
# end of this script.
dictionary = fd.loadFile(drive + 'dictionary_benchNN.pckl')

W, m_coef, v_coef = initializeCoefficients()
batch_size, epochs = setHyperparameters()
loss = []
# Run over the yearly files for 2013 and 2014, timing each year.
for year in range(2013, 2015):
    start = time.time()
    dataset = fd.loadFile(drive + str(year) + '10X_final.pckl')
    rd.shuffle(dataset)
    # loss and W are rebound to the values returned for this year; m_coef
    # and v_coef are passed unchanged on every iteration.
    loss, W = runNeuralNetwork(dataset, W, m_coef, v_coef)
    end = time.time()
    # Elapsed wall-clock seconds for this year.
    print(end - start)
# Write the dictionary back to the file it was loaded from.
fd.saveFile(dictionary, drive + 'dictionary_benchNN.pckl')

示例#7

0

显示文件

文件： SVMs.py 项目： tgarutti/Fin-SentiLex

import time
from sklearn import svm
from sklearn import metrics

# Base directory prefixed to every data file name in this script.
drive = '/Volumes/LaCie/Data/'
def svmDictionaries():
    """Load the pickled dictionaries and return them after filtering.

    The dictionary stored in ``Loughran_McDonald_dict.pckl`` is handed to
    ``fSVM.filterDicts`` as its first argument; the benchmark, classification
    and regression NN dictionaries (in that order) form the list passed as
    the second argument, and ``0.4`` is the third argument.

    Returns:
        The value returned by ``fSVM.filterDicts`` for those arguments.
    """
    reference = fd.loadFile(drive+'Loughran_McDonald_dict.pckl')
    nn_files = (
        'dictionary_benchNN.pckl',
        'dictionary_classificationNN.pckl',
        'dictionary_regressionNN.pckl',
    )
    # Loaded in the same order as listed above.
    candidates = [fd.loadFile(drive+name) for name in nn_files]
    return fSVM.filterDicts(reference, candidates, 0.4)

# Dictionaries are read from a saved pickle file.
# NOTE(review): presumably a stored result of svmDictionaries() - confirm.
dictionaries = fd.loadFile(drive+'SVM_dictionaries.pckl')
# Names used as keys for the per-dictionary train/test collections.
dict_names = ['Loughran', 'Benchmark', 'Classification', 'Regression']

def SVMDataset(dictionaries, dict_names):
    """Collect per-dictionary rows from the yearly files into train/test sets.

    Args:
        dictionaries: Passed through to ``fSVM.getScores``.
        dict_names: Keys of the ``train`` and ``test`` mappings; also passed
            through to ``fSVM.getScores``.

    NOTE(review): the definition is cut off in this excerpt after the start
    of the 2015-2018 loop, so how ``test`` is filled and what is returned is
    not visible here.
    """
    train, test = dict(),dict()
    # One empty list per dictionary name in both mappings.
    for d in dict_names:
        train[d] = []
        test[d] = []
    # Years 2000 through 2014 are appended to the training lists.
    for year in range(2000,2015):
        print(year)
        filename = drive+str(year)+'10X_final.pckl'
        X = fSVM.getScores(filename, dictionaries, dict_names)
        for d in dict_names:
            train[d].extend(X[d])
    # Years 2015 through 2018 (loop body beyond the print is not visible).
    for year in range(2015,2019):
        print(year)

示例#8

0

显示文件

            N = 0
            #N = (batch_mat.sum(1)).mean()
            #batch_mat1 = batch_mat/N
            batch_mat1 = fNN.tfidf2(batch_mat)
            batch_dictDF = pd.DataFrame(batch_dict)
            m = [batch_dictDF.loc['mp'],batch_dictDF.loc['mn'], Ms]
            v = [batch_dictDF.loc['vp'],batch_dictDF.loc['vn'], Vs]
            y, y_hat, X = forwardPropagation(batch, batch_dict, batch_mat1, coefficients)
            loss.append(fNN.MSELoss(y, y_hat))
            batch_dictDF, coefficients, Ms, Vs = backPropagation(batch_dictDF, batch_mat, y, y_hat, coefficients, X, m, v, N)
            d = batch_dictDF.to_dict()
            dictionary.update(d)
            end2 = time.time()
    return loss, coefficients
# Pause for 15000 seconds before anything else runs.
# NOTE(review): the reason for this delay is not visible in this excerpt -
# confirm it is intentional.
time.sleep(15000)
# Start from the filtered dictionary and pass it through fNN.initializeX.
dictionary = fd.loadFile(drive+'dictionary_filtered.pckl')
dictionary = fNN.initializeX(dictionary)
# Alternative start-up path (kept disabled): load the dictionary that this
# script saves at the end.
#dictionary = fd.loadFile(drive+'dictionary_regressionNN.pckl')
# NOTE(review): not referenced in the visible part of this script;
# presumably read as a global elsewhere - confirm.
n_docs = 276880

coefficients, Ms, Vs = initializeCoefficients()
batch_size, epochs = setHyperparameters()
loss = []
# Run over the yearly files for 2013 and 2014, timing each year.
for year in range(2013,2015):
    start = time.time()
    dataset = fd.loadFile(drive+str(year)+'10X_final.pckl')
    rd.shuffle(dataset)
    # loss and coefficients are rebound to the values returned for this
    # year; Ms and Vs are passed unchanged on every iteration.
    loss, coefficients = runNeuralNetwork(dataset, coefficients, Ms, Vs)
    end = time.time()
    # Elapsed wall-clock seconds for this year.
    print(end-start)
fd.saveFile(dictionary, drive+'dictionary_regressionNN.pckl')

示例#9

0

显示文件

文件： classificationNN.py 项目： tgarutti/Fin-SentiLex

            m = [batch_dictDF.loc['mp'], batch_dictDF.loc['mn'], Ms]
            v = [batch_dictDF.loc['vp'], batch_dictDF.loc['vn'], Vs]
            y, y_hat, X = forwardPropagation(batch, batch_dict, batch_mat1,
                                             coefficients)
            loss.append(fNN.crossEntropyLoss(y, y_hat))
            batch_dictDF, coefficients, Ms, Vs = backPropagation(
                batch_dictDF, batch_mat, y, y_hat, coefficients, X, m, v, N)
            d = batch_dictDF.to_dict()
            dictionary.update(d)
            end2 = time.time()
    return loss, coefficients


# Alternative start-up path (kept disabled): load the filtered dictionary
# and pass it through fNN.initializeX.
#dictionary = fd.loadFile(drive+'dictionary_filtered.pckl')
#dictionary = fNN.initializeX(dictionary)
# Start from the dictionary stored in the same file that is written at the
# end of this script.
dictionary = fd.loadFile(drive + 'dictionary_classificationNN.pckl')
# NOTE(review): not referenced in the visible part of this script;
# presumably read as a global elsewhere - confirm.
n_docs = 276880

coefficients, Ms, Vs = initializeCoefficients()
batch_size, epochs = setHyperparameters()
loss = []
# Run over the yearly files for 2007 through 2014, timing each year.
for year in range(2007, 2015):
    start = time.time()
    dataset = fd.loadFile(drive + str(year) + '10X_final.pckl')
    rd.shuffle(dataset)
    # loss and coefficients are rebound to the values returned for this
    # year; Ms and Vs are passed unchanged on every iteration.
    loss, coefficients = runNeuralNetwork(dataset, coefficients, Ms, Vs)
    end = time.time()
    # Elapsed wall-clock seconds for this year.
    print(end - start)
# Persist the dictionary, the coefficients and Ms.
# NOTE(review): Vs is not saved in the visible part of this script.
fd.saveFile(dictionary, drive + 'dictionary_classificationNN.pckl')
fd.saveFile(coefficients, drive + 'coefficients_classificationNN.pckl')
fd.saveFile(Ms, drive + 'Ms_classificationNN.pckl')

示例#10

0

显示文件

文件： teststuff.py 项目： tgarutti/master_thesis

import functionsData as fd
import pandas as pd
# CIK identifiers for which descriptives are exported.
descr_ciks = ['0000072971', '0001403161', '0000875320', '0001318605',
              '0000078003', '0001021860', '0000879101', '0000019617',
              '0000886982', '0000037996', '0000034088', '0000712515',
              '0000732717', '0000320193', '0000789019', '0000106640',
              '0001418091', '0001283699', '0000092380', '0001039684']
drive = '/Volumes/LaCie/Data/'
fullCIKs = fd.loadFile(drive+'CIKs_final.pckl')
# Iterated below as a mapping from sheet name to an object holding the
# 'Descriptives', 'Quantiles' and 'Periods' tables.
desc = fd.ciksDescriptives(descr_ciks)

name_xlsx = drive+'descriptivesCIK.xlsx'
# The context manager writes and closes the workbook on exit.
# ExcelWriter.save() was removed in pandas 2.0, so it can no longer be used.
with pd.ExcelWriter(name_xlsx, engine='xlsxwriter') as writer:
    workbook = writer.book
    for key in desc.keys():
        worksheet = workbook.add_worksheet(key)
        writer.sheets[key] = worksheet

        descriptives = desc[key]['Descriptives']
        quantiles = desc[key]['Quantiles']
        periods = desc[key]['Periods']

        # Layout: a title cell directly above each table, with the tables
        # stacked vertically and separated by blank rows.
        worksheet.write_string(0, 0, 'General Descriptives')
        descriptives.to_excel(writer, sheet_name=key, startrow=1, startcol=0)

        quantiles_row = descriptives.shape[0] + 5
        worksheet.write_string(quantiles_row - 1, 0, 'Quantiles')
        quantiles.to_excel(writer, sheet_name=key, startrow=quantiles_row, startcol=0)

        periods_row = quantiles_row + quantiles.shape[0] + 5
        worksheet.write_string(periods_row - 1, 0, 'Periods')
        periods.to_excel(writer, sheet_name=key, startrow=periods_row, startcol=0)