Example #1
    def _runSingleProcess(self, urlGen, outputFile):
        '''
        Single-process implementation. Iterates through the URLs in the
        generator and builds the result DataFrames in memory. Suitable
        for smaller data pulls.
        '''

        parser = FileParser()
        lineItems = []
        disclosures = []
        i = 0
        for i, url in enumerate(urlGen, 1):
            res, dd = parser.fileUrl2Result(url)
            lineItems.append(res._toDict())
            disclosures.append(dd._asdict())

            if i % 25 == 0:
                edgarScraperLog.info("Scraped {} total files".format(i))

            if i % 100 == 0:
                self._recordResults(lineItems, disclosures, outputFile)
                lineItems = []
                disclosures = []

        edgarScraperLog.info("Job Finished {} total files".format(i))

        if lineItems:
            self._recordResults(lineItems, disclosures, outputFile)

        if not outputFile:
            lineItemDf = pd.concat(self.lineItemBuffer)
            disclosureDf = pd.concat(self.disclosureBuffer)
            return (lineItemDf, disclosureDf)
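
The loop above delegates batch flushing to self._recordResults, whose body is not shown. Below is a minimal sketch consistent with its call sites, assuming pandas is imported as pd; the on-disk layout and the derived disclosures filename are assumptions, not taken from the source.

    def _recordResults(self, lineItems, disclosures, outputFile):
        lineItemDf = pd.DataFrame(lineItems)
        disclosureDf = pd.DataFrame(disclosures)

        if outputFile:
            # Flush the batch to disk; appending each frame to its own
            # CSV is an assumed layout, not shown in the source.
            lineItemDf.to_csv(outputFile, mode='a', header=False, index=False)
            disclosureDf.to_csv(outputFile + '.disclosures', mode='a',
                                header=False, index=False)
        else:
            # Buffer in memory; _runSingleProcess concatenates these
            # lists into the final DataFrames when no outputFile is set.
            self.lineItemBuffer.append(lineItemDf)
            self.disclosureBuffer.append(disclosureDf)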
Example #2
    def test_xbrl(self):
        url = self.baseUrl + 'data/730255/0001206774-15-003149.txt'
        testIndexResult = IndexResult(0, 'test', 'test', 'test', url)
        parser = FileParser()
        result, dd = parser.fileUrl2Result(testIndexResult)

        self.assertIsNotNone(result.lineItems)
        self.assertIsInstance(result, ResultSet)
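
This test constructs IndexResult positionally with five fields; its definition is not shown. A plausible shape, inferred from this call and from the url.seqNo access in Example #5 below, is sketched here. Only seqNo and url are implied by the code; the three middle field names are placeholders.

from collections import namedtuple

# Guessed shape: seqNo and url are implied by usage elsewhere; cik,
# company, and date are placeholder names for the three 'test'
# arguments above.
IndexResult = namedtuple(
    'IndexResult', ['seqNo', 'cik', 'company', 'date', 'url']
)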
Example #3
    def test_text(self):
        url = self.baseUrl + 'data/1005406/0000898430-96-001816.txt'
        testIndexResult = IndexResult(2, 'test', 'test', 'test', url)
        parser = FileParser()
        result, dd = parser.fileUrl2Result(testIndexResult)

        self.assertIsNotNone(result.lineItems)
        self.assertIsInstance(result, ResultSet)
        self.assertIsNone(dd.TEXT)
Example #4
    def test_html(self):
        url = self.baseUrl + 'data/1144215/0001193125-10-001623.txt'
        testIndexResult = IndexResult(1, 'test', 'test', 'test', url)
        parser = FileParser()
        result, dd = parser.fileUrl2Result(testIndexResult)

        self.assertIsNotNone(result.lineItems)
        self.assertIsInstance(result, ResultSet)
        self.assertIsNone(dd.TEXT)
Example #5
def _mpJob(urlGen, file):
    '''
    A multiprocessing task must be defined at top level. Each worker
    process iterates over a slice of the URL generator; URLs are parsed
    and the results are returned to the main process for aggregation.
    '''
    parser = FileParser()
    resDictList = []
    ddList = []

    for url in urlGen:
        res, dd = parser.fileUrl2Result(url)
        resDictList.append(res._toDict())
        ddList.append(dd._asdict())

        # Log progress every 100 files.
        if url.seqNo % 100 == 0:
            edgarScraperLog.info('finished consuming file {}'.format(url.seqNo))

    return (resDictList, ddList)
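
The docstring above implies a driver that slices the URL stream across workers and aggregates their return values. A minimal sketch of such a driver follows, using a standard multiprocessing.Pool; the name _runMultiProcess and the nProcs parameter are illustrative, not from the source.

from multiprocessing import Pool

def _runMultiProcess(urls, file, nProcs=4):
    # One contiguous slice of URLs per worker.
    urls = list(urls)
    chunk = max(1, -(-len(urls) // nProcs))  # ceiling division
    slices = [urls[i:i + chunk] for i in range(0, len(urls), chunk)]

    with Pool(nProcs) as pool:
        perWorker = pool.starmap(_mpJob, [(s, file) for s in slices])

    # Flatten the per-worker lists in the main process.
    resDictList = [d for res, _ in perWorker for d in res]
    ddList = [d for _, dd in perWorker for d in dd]
    return (resDictList, ddList)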
Example #6
    def setUp(self):
        self.baseUrl = 'https://www.sec.gov/Archives/edgar/'
        self.parser = FileParser()
        self.he = HTMLExtractor()