示例#1
0
def _readCompletedDocNames(logFilename):
    """Return the set of document names already recorded in the log file.

    Each line of the log is one document name. A log file that does not
    exist yet is treated as "nothing completed" and yields an empty set;
    any other I/O error is re-raised.
    """
    import errno  # local import: the file's import block is not visible here

    try:
        with open(logFilename, 'r') as logFile:
            return set(line.strip() for line in logFile)
    except IOError as err:
        if err.errno != errno.ENOENT:
            raise
        return set()


def _parseCheckLine(line):
    """Build one work record from a tab-separated line of the check file.

    The record layout is
    [companyCode, filingURLList, docNameList, indexURLList, filingType].
    """
    fields = line.split('\t')
    companyCode = fields[0].strip()
    docName = fields[5].strip()
    url = fields[6].strip()
    # NOTE(review): column 6 is used for both the filing URL and the index
    # URL, exactly as before -- confirm that this reuse is intended.
    return [companyCode, [url], [docName], [url], '10-K']


def extractSectionFromExistingFilings(numThreads):
    """Extract sections for every filing in '../check.txt' not yet logged.

    Reads the tab-separated check file, skips entries whose document name
    already appears in 'extraction_log.txt', and processes the remaining
    entries in reverse file order.

    numThreads -- when greater than 1 the work list is handed to
                  calculateParallel together with extractSingleSection;
                  otherwise entries are processed one by one with a single
                  SecCrawler, and each finished document name is appended
                  to the log.

    Returns None.
    """
    filename = '../check.txt'
    completedFilename = 'extraction_log.txt'

    # A set makes the "already done?" test O(1) per entry.
    completed = _readCompletedDocNames(completedFilename)

    with open(filename) as check:
        # Whitespace-only lines carry no fields and are skipped.
        readLines = [_parseCheckLine(line) for line in check if line.strip()]

    sectionsToGet = []
    for record in reversed(readLines):
        docName = record[2][0]
        if docName in completed:
            # Two spaces reproduce the output of the former Python 2
            # statement `print 'Already downloaded ', name`.
            print('Already downloaded  ' + docName)
        else:
            sectionsToGet.append(record)

    if numThreads > 1:
        print('THREADED')
        calculateParallel(sectionsToGet, extractSingleSection, numThreads)
        return

    print('NONTHREADED')
    seccrawler = SecCrawler()

    for record in sectionsToGet:
        companyCode, filingURLList, docNameList, indexURLList, filingType = record
        if '.txt' in filingURLList[0]:
            # Plain-text filings are not processed (and not logged).
            continue
        seccrawler.save_in_directory(companyCode, filingURLList,
                                     docNameList, indexURLList,
                                     filingType)
        # Re-open per entry so progress is on disk even if a later
        # entry fails.
        with open(completedFilename, 'a') as logFile:
            logFile.write(docNameList[0] + '\n')
示例#2
0
def extractSingleSection(inputs):
    """Extract the sections of one filing and record it in the log file.

    inputs -- a 5-item sequence:
              (companyCode, filingURLList, docNameList, indexURLList,
              filingType).

    Returns a log string: 'Skipping text file.' when the first filing URL
    contains '.txt', otherwise a message with the time spent in
    SecCrawler.save_in_directory.

    Raises Exception whose message is the full formatted traceback of any
    error raised while saving or logging. Presumably this keeps the
    traceback text available when the function runs in a worker via
    calculateParallel -- confirm against that helper.
    """
    companyCode, filingURLList, docNameList, indexURLList, filingType = inputs

    if '.txt' in filingURLList[0]:
        return 'Skipping text file.'

    t1 = time.time()
    seccrawler = SecCrawler()
    try:
        seccrawler.save_in_directory(companyCode, filingURLList,
                                     docNameList, indexURLList, filingType)
        # Stop the clock before the log write so only extraction is timed.
        t2 = time.time()

        completedFilename = 'extraction_log.txt'
        with open(completedFilename, 'a') as logFile:
            logFile.write(docNameList[0] + '\n')
    except Exception:
        # Put all exception text into an exception and raise that.
        # `except Exception` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate unchanged.
        raise Exception(traceback.format_exc())

    return "Total Time taken for " + companyCode + " sections: " + str(t2 - t1)