def test_filing_get_urls_returns_single_list_of_urls(self, monkeypatch):
    """Check that each of three looked-up CIKs maps to exactly three URLs.

    ``_CIKValidator.get_ciks`` and ``NetworkClient.get_response`` are
    replaced through ``monkeypatch`` before the filing is built.
    """
    monkeypatch.setattr(_CIKValidator, "get_ciks",
                        MockCIKValidatorMultipleCIKs.get_ciks)
    # Use same response for each request
    monkeypatch.setattr(NetworkClient, "get_response", MockSingleCIKFiling)
    ciks = CIKLookup(['aapl', 'msft', 'amzn'])
    f = Filing(ciks, FilingType.FILING_10Q, count=3)

    # Fetch the mapping once instead of re-running get_urls() for the key
    # list and again for every key.
    urls_by_cik = f.get_urls()
    # all() over an empty mapping is vacuously true, so rule that out first.
    assert urls_by_cik, "get_urls() returned no entries"
    assert all(len(urls) == 3 for urls in urls_by_cik.values())
def test_filing_returns_correct_number_of_urls(
        self,
        count,
        mock_cik_validator_get_multiple_ciks,
        mock_single_cik_filing):
    """Check that every looked-up CIK maps to exactly ``count`` URLs.

    The two ``mock_*`` arguments are never read in the body; they are
    requested only for whatever setup they perform (presumably installing
    mocks -- confirm against the fixture definitions).
    """
    # Uses same response for filing links (will all be filings for aapl)
    f = Filing(cik_lookup=['aapl', 'msft', 'amzn'],
               filing_type=FilingType.FILING_10Q,
               count=count,
               client=NetworkClient(batch_size=10))

    # Fetch the mapping once instead of re-running get_urls() for the key
    # list and again for every key.
    urls_by_cik = f.get_urls()
    # all() over an empty mapping is vacuously true, so rule that out first.
    assert urls_by_cik, "get_urls() returned no entries"
    assert all(len(urls) == count for urls in urls_by_cik.values())
def test_filing_get_urls_returns_single_list_of_urls(
        self,
        mock_cik_validator_get_multiple_ciks,
        mock_single_cik_filing):
    """Check that each of three looked-up CIKs maps to exactly five URLs.

    The two ``mock_*`` arguments are never read in the body; they are
    requested only for whatever setup they perform (presumably installing
    mocks -- confirm against the fixture definitions).
    """
    # Uses same response for filing links (will all be filings for aapl)
    f = Filing(cik_lookup=["aapl", "msft", "amzn"],
               filing_type=FilingType.FILING_10Q,
               count=5)

    # Fetch the mapping once instead of re-running get_urls() for the key
    # list and again for every key.
    urls_by_cik = f.get_urls()
    # all() over an empty mapping is vacuously true, so rule that out first.
    assert urls_by_cik, "get_urls() returned no entries"
    assert all(len(urls) == 5 for urls in urls_by_cik.values())
def test_filing_returns_correct_number_of_urls(self, monkeypatch, count):
    """Check that every looked-up CIK maps to exactly ``count`` URLs.

    ``_CIKValidator.get_ciks`` and ``NetworkClient.get_response`` are
    replaced through ``monkeypatch`` before the filing is built.
    """
    monkeypatch.setattr(_CIKValidator, "get_ciks",
                        MockCIKValidatorMultipleCIKs.get_ciks)
    # Use same response for each request
    monkeypatch.setattr(NetworkClient, "get_response", MockSingleCIKFiling)
    f = Filing(cik_lookup=['aapl', 'msft', 'amzn'],
               filing_type=FilingType.FILING_10Q,
               count=count,
               client=NetworkClient(batch_size=10))

    # Fetch the mapping once instead of re-running get_urls() for the key
    # list and again for every key.
    urls_by_cik = f.get_urls()
    # all() over an empty mapping is vacuously true, so rule that out first.
    assert urls_by_cik, "get_urls() returned no entries"
    assert all(len(urls) == count for urls in urls_by_cik.values())
def test_txt_urls(self, monkeypatch):
    """Check that the first URL returned for a filing has a ``txt`` extension."""
    # Install the replacements before building the object under test so that
    # nothing the constructor does can reach the unpatched implementations.
    monkeypatch.setattr(CIKValidator, "get_ciks",
                        MockCIKValidatorGetCIKs.get_ciks)
    monkeypatch.setattr(NetworkClient, "get_response", MockSingleCIKFiling)

    aapl = Filing(cik='aapl', filing_type=FilingType.FILING_10Q, count=10)
    first_txt_url = aapl.get_urls()[0]
    assert first_txt_url.split('.')[-1] == 'txt'
def test_txt_urls(self,
                  mock_cik_validator_get_single_cik,
                  mock_single_cik_filing):
    """Verify the first "aapl" URL ends in a ``txt`` extension.

    The fixture arguments are not read in the body; they are requested only
    for whatever setup they perform.
    """
    filing = Filing(cik_lookup="aapl",
                    filing_type=FilingType.FILING_10Q,
                    count=10)
    aapl_urls = filing.get_urls()["aapl"]
    first_txt_url = aapl_urls[0]
    # Text after the last "." (or the whole string when there is no ".").
    _, _, extension = first_txt_url.rpartition(".")
    assert extension == "txt"
def test_count_returns_exact(self, monkeypatch):
    """Check that the number of "aapl" URLs equals the client's ``count``.

    Raises:
        AssertionError: if the number of URLs differs from
            ``aapl.client.count``.
    """
    # Install the replacements before building the object under test so that
    # nothing the constructor does can reach the unpatched implementations.
    monkeypatch.setattr(_CIKValidator, "get_ciks",
                        MockCIKValidatorGetCIKs.get_ciks)
    monkeypatch.setattr(NetworkClient, "get_response", MockSingleCIKFiling)

    aapl = Filing(cik_lookup='aapl',
                  filing_type=FilingType.FILING_10Q,
                  count=10)
    urls = aapl.get_urls()['aapl']
    expected = aapl.client.count
    if len(urls) != expected:
        # Report how many URLs came back, not the URL list itself, so the
        # message reads as the count comparison it describes.
        raise AssertionError(
            "Count should return exact number of filings. "
            "Got {0}, but expected {1} URLs.".format(len(urls), expected))
def test_count_returns_exact(self,
                             mock_cik_validator_get_single_cik,
                             mock_single_cik_filing):
    """Check that exactly ``count`` URLs are returned for "aapl".

    The fixture arguments are not read in the body; they are requested only
    for whatever setup they perform.

    Raises:
        AssertionError: if the number of URLs differs from ``count``.
    """
    count = 10
    aapl = Filing(cik_lookup='aapl',
                  filing_type=FilingType.FILING_10Q,
                  count=count)
    urls = aapl.get_urls()['aapl']
    if len(urls) != count:
        # Report how many URLs came back, not the URL list itself, so the
        # message reads as the count comparison it describes.
        raise AssertionError(
            "Count should return exact number of filings. "
            "Got {0}, but expected {1} URLs.".format(len(urls), count))
def __get_data(self, cik, filing_type, data_set):
    """Collect per-filing covenant-violation rows and word counts for a CIK.

    Filings are saved under ``../data/company_filings/{cik}_{type}/`` unless
    that directory already exists, in which case it is used as a cache.
    Every file found under the directory is then analysed.

    Args:
        cik: Company identifier; converted with ``str`` for ``Filing``.
        filing_type: Filing type object; its ``.value`` is used in the cache
            path and in the output rows.
        data_set: Value copied verbatim into the ``'dataset'`` column.

    Returns:
        A ``(result, filing_word_count)`` tuple: a ``DataFrame`` with one row
        per accepted file (every row keeps index ``0``) and a dict that sums
        the per-file word counts.

    Raises:
        AssertionError: if a file's metadata does not end up with exactly
            eight entries (for example when no filing URL matches the file).
    """
    filing_word_count = dict()
    my_filings = Filing(cik=str(cik), filing_type=filing_type)
    path = f'../data/company_filings/{cik}_{filing_type.value}/'

    if not os.path.exists(path):
        print(f'Fetching data for cik={cik}, filing_type={filing_type}')
        try:
            my_filings.save(path)
        except BaseException as exc:
            # Drop a partially written cache so that a later run does not
            # mistake it for a complete download.
            if os.path.exists(path):
                try:
                    shutil.rmtree(path)
                except OSError as e:
                    print("Error: %s : %s" % (path, e.strerror))
            # Still best-effort for ordinary failures, but never swallow
            # KeyboardInterrupt / SystemExit, and never fail silently.
            if not isinstance(exc, Exception):
                raise
            print(f'Failed to fetch data for cik={cik}, '
                  f'filing_type={filing_type}: {exc!r}')
    else:
        print(f'Skipping data fetching. Using cache at {path}')

    rows = []
    # Built on first use so get_urls() runs at most once (and not at all
    # when there are no files), instead of once per file on disk.
    url_by_file_name = None

    for subdir, _dirs, files in os.walk(path):
        for file_name in files:
            file_path = f'{subdir}/{file_name}'
            file_metadata = self.__get_file_metadata(file_path)

            if url_by_file_name is None:
                url_by_file_name = {}
                for url in my_filings.get_urls():
                    # setdefault keeps the first URL for a given file name,
                    # matching a first-match linear search.
                    url_by_file_name.setdefault(
                        url.rsplit('/')[-1].strip(), url)
            if file_name in url_by_file_name:
                file_metadata['url'] = url_by_file_name[file_name]

            assert len(
                file_metadata
            ) == 8, "Could not get all relevant metadata: %r" % file_metadata

            if file_metadata['year'] < 2007 or \
                    file_metadata['form_type'] not in ('10-K', '10-Q'):
                print(
                    f'Skipping file. year={file_metadata["year"]} form_type={file_metadata["form_type"]}'
                )
                continue

            violations_in_file, local_word_count = \
                self.__get_violations_for_file(file_path)

            file_info = {
                'cik': cik,
                'firm name': file_metadata['company_name'],
                'firm address': file_metadata['address'],
                'zip code': str(file_metadata['zip']),
                'year': file_metadata['year'],
                'quarter': file_metadata['quarter']
                if filing_type is FilingType.FILING_10Q else None,
                'url': file_metadata['url'],
                'filing type': filing_type.value,
                'dataset': data_set,
                'has covenant violation': 0 if violations_in_file == 0 else 1,
                'total violations': violations_in_file
            }
            rows.append(pd.DataFrame(file_info, index=[0]))

            for word in local_word_count:
                filing_word_count[word] = (
                    filing_word_count.get(word, 0) + local_word_count[word])

    # DataFrame.append was removed in pandas 2.0 and copies the whole frame
    # on every call; concatenate the collected one-row frames once instead.
    # pd.concat raises on an empty list, hence the explicit fallback.
    result = pd.concat(rows) if rows else pd.DataFrame()
    return result, filing_word_count
def test_txt_urls(self,
                  mock_cik_validator_get_single_cik,
                  mock_single_cik_filing):
    """Verify the first 'aapl' URL ends in a ``txt`` extension.

    The fixture arguments are not read in the body; they are requested only
    for whatever setup they perform.
    """
    filing = Filing(cik_lookup='aapl',
                    filing_type=FilingType.FILING_10Q,
                    count=10)
    aapl_urls = filing.get_urls()['aapl']
    first_txt_url = aapl_urls[0]
    # Text after the last '.' (or the whole string when there is no '.').
    _, _, extension = first_txt_url.rpartition('.')
    assert extension == 'txt'