class XBRLExtractorTest(unittest.TestCase):
    '''
    Integration tests for XBRLExtractor against live EDGAR filings.

    NOTE(review): these hit the real SEC servers via FileParser._getFile,
    so they require network access.
    '''

    def setUp(self):
        self.baseUrl = 'https://www.sec.gov/Archives/edgar/'
        self.parser = FileParser()
        self.xe = XBRLExtractor()

    def _assertExtractsLineItems(self, path):
        '''
        Fetch the filing at baseUrl + path and assert that
        XBRLExtractor.processText produces at least one DebtLineItem.
        Shared by every test below (they differed only in the url path).
        '''
        text, code = self.parser._getFile(self.baseUrl + path)
        result, debtDisclosure = self.xe.processText(text)
        self.assertTrue(result)
        self.assertIsInstance(result[0], DebtLineItem)

    def test_xbrl1(self):
        self._assertExtractsLineItems('data/1005286/0001437749-15-017917.txt')

    def test_xbrl2(self):
        self._assertExtractsLineItems('data/815097/0000815097-16-000040.txt')

    def test_xbrl3(self):
        self._assertExtractsLineItems('data/63754/0000063754-16-000089.txt')

    def test_longXbrl(self):
        self._assertExtractsLineItems('data/1393612/0001193125-11-091525.txt')
def _runSingleProcess(self, urlGen, outputFile):
    '''
    Single process implementation. Iterates through urls in the generator
    and builds the result dataframe in memory. Suitable for smaller data
    pulls.

    urlGen     -- iterable of url objects accepted by FileParser.fileUrl2Result
    outputFile -- destination forwarded to _recordResults; when falsy the
                  buffered results are concatenated and returned instead.

    Returns (lineItemDf, disclosureDf) when outputFile is falsy, else None.
    '''
    parser = FileParser()
    lineItems = []
    disclosures = []
    # BUG FIX: i must be pre-bound -- with an empty urlGen the loop body
    # never runs and the "Job Finished" log line raised NameError.
    i = 0

    for (i, url) in enumerate(urlGen, 1):
        res, dd = parser.fileUrl2Result(url)
        lineItems.append(res._toDict())
        disclosures.append(dd._asdict())

        if (i % 25 == 0):
            edgarScraperLog.info("Scraped {} total files".format(i))

        # Flush periodically so memory stays bounded on long runs.
        if (i % 100 == 0):
            self._recordResults(lineItems, disclosures, outputFile)
            lineItems = []
            disclosures = []

    edgarScraperLog.info("Job Finished {} total files".format(i))

    # Record any remainder that did not reach a 100-file flush boundary.
    if lineItems:
        self._recordResults(lineItems, disclosures, outputFile)

    if not outputFile:
        # presumably _recordResults appends to these buffers when no
        # outputFile is given -- TODO confirm against _recordResults
        lineItemDf = pd.concat(self.lineItemBuffer)
        disclosureDf = pd.concat(self.disclosureBuffer)
        return (lineItemDf, disclosureDf)
def test_xbrl(self):
    '''An XBRL filing (form code 0) parses into a ResultSet with line items.'''
    filingUrl = self.baseUrl + 'data/730255/0001206774-15-003149.txt'
    indexEntry = IndexResult(0, 'test', 'test', 'test', filingUrl)
    resultSet, dd = FileParser().fileUrl2Result(indexEntry)
    self.assertIsNotNone(resultSet.lineItems)
    self.assertIsInstance(resultSet, ResultSet)
def test_text(self):
    '''A plain-text filing (form code 2) parses; the TEXT disclosure is None.'''
    filingUrl = self.baseUrl + 'data/1005406/0000898430-96-001816.txt'
    indexEntry = IndexResult(2, 'test', 'test', 'test', filingUrl)
    resultSet, dd = FileParser().fileUrl2Result(indexEntry)
    self.assertIsNotNone(resultSet.lineItems)
    self.assertIsInstance(resultSet, ResultSet)
    self.assertIsNone(dd.TEXT)
def test_html(self):
    '''An HTML filing (form code 1) parses; the TEXT disclosure is None.'''
    filingUrl = self.baseUrl + 'data/1144215/0001193125-10-001623.txt'
    indexEntry = IndexResult(1, 'test', 'test', 'test', filingUrl)
    resultSet, dd = FileParser().fileUrl2Result(indexEntry)
    self.assertIsNotNone(resultSet.lineItems)
    self.assertIsInstance(resultSet, ResultSet)
    self.assertIsNone(dd.TEXT)
def _mpJob(urlGen, file):
    '''
    Multiprocessing task must be defined at top-level. Each worker process
    iterates over a slice of the url-generator; each url is parsed and the
    per-file result dicts are returned to the main process for aggregation.

    Returns a (resultDicts, disclosureDicts) pair of parallel lists.
    '''
    parser = FileParser()
    resultDicts = []
    disclosureDicts = []

    for entry in urlGen:
        res, dd = parser.fileUrl2Result(entry)
        resultDicts.append(res._toDict())
        disclosureDicts.append(dd._asdict())

        # Progress heartbeat every 100 files, keyed on the entry's seqNo.
        if entry.seqNo % 100 == 0:
            edgarScraperLog.info('finished consuming file {}'.format(entry.seqNo))

    return (resultDicts, disclosureDicts)
class HTMLExtractorTest(unittest.TestCase):
    '''
    Integration tests for HTMLExtractor against live EDGAR filings,
    covering filings with tables, without tables, and with zero values.

    NOTE(review): these hit the real SEC servers via FileParser._getFile,
    so they require network access.
    '''

    def setUp(self):
        self.baseUrl = 'https://www.sec.gov/Archives/edgar/'
        self.parser = FileParser()
        self.he = HTMLExtractor()

    def _assertExtractsLineItems(self, path):
        '''
        Fetch the filing at baseUrl + path and assert that
        HTMLExtractor.processText produces at least one DebtLineItem.
        Shared by every test below (they differed only in the url path).
        '''
        text, code = self.parser._getFile(self.baseUrl + path)
        result = self.he.processText(text)
        self.assertTrue(result)
        self.assertIsInstance(result[0], DebtLineItem)

    def test_htmlWithTables(self):
        self._assertExtractsLineItems('data/1012956/0001193125-03-015316.txt')

    def test_htmlWithTables2(self):
        self._assertExtractsLineItems('data/225261/0001047469-03-023135.txt')

    def test_htmlWithTables3(self):
        self._assertExtractsLineItems('data/1368055/0001140361-10-002064.txt')

    def test_htmlWOTables(self):
        self._assertExtractsLineItems('data/915337/0001002334-10-000006.txt')

    def test_htmlWOTables2(self):
        self._assertExtractsLineItems('data/1439981/0001137050-10-000003.txt')

    def test_htmlWithZeros(self):
        self._assertExtractsLineItems('data/1468679/0001271008-11-000006.txt')
def setUp(self):
    '''Build the per-test fixtures: an HTML extractor, a file parser,
    and the EDGAR archive root url.'''
    self.he = HTMLExtractor()
    self.parser = FileParser()
    self.baseUrl = 'https://www.sec.gov/Archives/edgar/'