Пример #1
0
class XBRLExtractorTest(unittest.TestCase):
    '''
    Exercises XBRLExtractor.processText against several EDGAR filings.

    Each test retrieves the filing text through FileParser._getFile using a
    URL under the sec.gov archive root, so the suite depends on whatever
    that call needs to reach the document (presumably network access --
    confirm against FileParser).
    '''

    def setUp(self):
        '''Create the archive root URL, a FileParser and the extractor.'''
        self.baseUrl = 'https://www.sec.gov/Archives/edgar/'
        self.parser = FileParser()
        self.xe = XBRLExtractor()

    def _assertExtractsDebtLineItems(self, relativePath):
        '''
        Fetch the filing at ``baseUrl + relativePath``, run the extractor on
        its text, and assert that the first element of the returned pair is
        truthy and starts with a DebtLineItem.
        '''
        filingUrl = self.baseUrl + relativePath
        # _getFile returns a pair; only the first element is used here.
        filingText, _code = self.parser._getFile(filingUrl)
        # processText returns a pair as well; the second element is ignored.
        lineItems, _debtDisclosure = self.xe.processText(filingText)
        self.assertTrue(lineItems)
        self.assertIsInstance(lineItems[0], DebtLineItem)

    def test_xbrl1(self):
        self._assertExtractsDebtLineItems(
            'data/1005286/0001437749-15-017917.txt')

    def test_xbrl2(self):
        self._assertExtractsDebtLineItems(
            'data/815097/0000815097-16-000040.txt')

    def test_xbrl3(self):
        self._assertExtractsDebtLineItems(
            'data/63754/0000063754-16-000089.txt')

    def test_longXbrl(self):
        self._assertExtractsDebtLineItems(
            'data/1393612/0001193125-11-091525.txt')
Пример #2
0
    def _runSingleProcess(self, urlGen, outputFile):
        '''
        Single process implementation.  Iterates through urls in the generator,
        parses each one with a FileParser and passes the accumulated rows to
        self._recordResults in batches of 100 (plus one final partial batch).
        Suitable for smaller data pulls.

        Parameters:
            urlGen: iterable of items accepted by FileParser.fileUrl2Result.
                May be empty.
            outputFile: forwarded unchanged to self._recordResults.  When it
                is falsy, the contents of self.lineItemBuffer and
                self.disclosureBuffer are concatenated and returned.

        Returns:
            (lineItemDf, disclosureDf) when outputFile is falsy, otherwise
            None.
        '''

        parser = FileParser()
        lineItems = []
        disclosures = []

        # Start the counter at zero: if urlGen yields nothing the loop never
        # binds it, and the summary log below would raise UnboundLocalError.
        processed = 0
        for (processed, url) in enumerate(urlGen, 1):
            res, dd = parser.fileUrl2Result(url)
            lineItems.append(res._toDict())
            disclosures.append(dd._asdict())

            if (processed % 25 == 0):
                edgarScraperLog.info(
                    "Scraped {} total files".format(processed))

            # Flush every 100 files so the pending lists stay small.
            if (processed % 100 == 0):
                self._recordResults(lineItems, disclosures, outputFile)
                lineItems = []
                disclosures = []

        edgarScraperLog.info(
            "Job Finished {} total files".format(processed))

        # Record whatever is left over from the last partial batch.
        if lineItems:
            self._recordResults(lineItems, disclosures, outputFile)

        if not outputFile:
            # NOTE(review): pd.concat raises ValueError on an empty sequence;
            # presumably _recordResults fills these buffers when no output
            # file is given -- confirm the behaviour for an empty urlGen.
            lineItemDf = pd.concat(self.lineItemBuffer)
            disclosureDf = pd.concat(self.disclosureBuffer)
            return (lineItemDf, disclosureDf)

        return None
Пример #3
0
    def test_xbrl(self):
        '''
        Feed an IndexResult pointing at an EDGAR filing through
        FileParser.fileUrl2Result and check that a ResultSet with non-None
        lineItems comes back.
        '''
        filingUrl = self.baseUrl + 'data/730255/0001206774-15-003149.txt'
        indexEntry = IndexResult(0, 'test', 'test', 'test', filingUrl)

        # fileUrl2Result returns a pair; the second element is not checked.
        resultSet, _disclosure = FileParser().fileUrl2Result(indexEntry)

        self.assertIsNotNone(resultSet.lineItems)
        self.assertIsInstance(resultSet, ResultSet)
Пример #4
0
    def test_text(self):
        '''
        Feed an IndexResult pointing at an EDGAR filing (presumably a
        plain-text one, going by the test name) through
        FileParser.fileUrl2Result.  Expects a ResultSet with non-None
        lineItems and a None TEXT field on the second returned value.
        '''
        filingUrl = self.baseUrl + 'data/1005406/0000898430-96-001816.txt'
        indexEntry = IndexResult(2, 'test', 'test', 'test', filingUrl)

        resultSet, disclosure = FileParser().fileUrl2Result(indexEntry)

        self.assertIsNotNone(resultSet.lineItems)
        self.assertIsInstance(resultSet, ResultSet)
        self.assertIsNone(disclosure.TEXT)
Пример #5
0
    def test_html(self):
        '''
        Feed an IndexResult pointing at an EDGAR filing (presumably an HTML
        one, going by the test name) through FileParser.fileUrl2Result.
        Expects a ResultSet with non-None lineItems and a None TEXT field on
        the second returned value.
        '''
        filingUrl = self.baseUrl + 'data/1144215/0001193125-10-001623.txt'
        indexEntry = IndexResult(1, 'test', 'test', 'test', filingUrl)

        resultSet, disclosure = FileParser().fileUrl2Result(indexEntry)

        self.assertIsNotNone(resultSet.lineItems)
        self.assertIsInstance(resultSet, ResultSet)
        self.assertIsNone(disclosure.TEXT)
Пример #6
0
def _mpJob(urlGen, file):
    '''
    Multiprocessing task must be defined at top-level.  Each worker process
    iterates over a slice of the url-generator.  Urls are processed and
    results are returned to the main processes for aggregation.

    Parameters:
        urlGen: iterable of items accepted by FileParser.fileUrl2Result; each
            item must expose a ``seqNo`` attribute, which drives the progress
            log.  May be empty.
        file: not used by this function; kept so the call signature stays
            unchanged.

    Returns:
        (resDictList, ddList): two lists, one entry per processed item,
        holding ``res._toDict()`` and ``dd._asdict()`` respectively.
    '''
    parser = FileParser()
    resDictList = []
    ddList = []

    for url in urlGen:

        (res, dd) = parser.fileUrl2Result(url)
        resDictList.append(res._toDict())
        ddList.append(dd._asdict())

        # Progress logging belongs inside the loop: placed after it, the
        # check only ever saw the last item and raised NameError when
        # urlGen was empty.
        if url.seqNo % 100 == 0:
            edgarScraperLog.info(
                'finished consuming file {}'.format(url.seqNo))

    return (resDictList, ddList)
Пример #7
0
class HTMLExtractorTest(unittest.TestCase):
    '''
    Exercises HTMLExtractor.processText against several EDGAR filings.

    Each test retrieves the filing text through FileParser._getFile using a
    URL under the sec.gov archive root, so the suite depends on whatever
    that call needs to reach the document (presumably network access --
    confirm against FileParser).
    '''

    def setUp(self):
        '''Create the archive root URL, a FileParser and the extractor.'''
        self.baseUrl = 'https://www.sec.gov/Archives/edgar/'
        self.parser = FileParser()
        self.he = HTMLExtractor()

    def _assertExtractsDebtLineItems(self, relativePath):
        '''
        Fetch the filing at ``baseUrl + relativePath``, run the extractor on
        its text, and assert that the result is truthy and that its first
        element is a DebtLineItem.
        '''
        filingUrl = self.baseUrl + relativePath
        # _getFile returns a pair; only the first element is used here.
        filingText, _code = self.parser._getFile(filingUrl)
        lineItems = self.he.processText(filingText)
        self.assertTrue(lineItems)
        self.assertIsInstance(lineItems[0], DebtLineItem)

    def test_htmlWithTables(self):
        self._assertExtractsDebtLineItems(
            'data/1012956/0001193125-03-015316.txt')

    def test_htmlWithTables2(self):
        self._assertExtractsDebtLineItems(
            'data/225261/0001047469-03-023135.txt')

    def test_htmlWithTables3(self):
        self._assertExtractsDebtLineItems(
            'data/1368055/0001140361-10-002064.txt')

    def test_htmlWOTables(self):
        self._assertExtractsDebtLineItems(
            'data/915337/0001002334-10-000006.txt')

    def test_htmlWOTables2(self):
        self._assertExtractsDebtLineItems(
            'data/1439981/0001137050-10-000003.txt')

    def test_htmlWithZeros(self):
        self._assertExtractsDebtLineItems(
            'data/1468679/0001271008-11-000006.txt')
Пример #8
0
 def setUp(self):
     '''Create the archive root URL, a FileParser and an HTMLExtractor.'''
     self.baseUrl = 'https://www.sec.gov/Archives/edgar/'
     # Both collaborators are constructed in the original order: the parser
     # first, then the extractor.
     self.parser, self.he = FileParser(), HTMLExtractor()