def __init__(self, url, company=None):
    '''
    Downloads the SGML filing at url, parses it and indexes its documents

    :param url: location of the SGML filing to fetch
    :param company: company the filing belongs to (a company rather than a
        ticker symbol, since not all edgar companies are publicly traded)

    Sets self.url, self.company, self.text, self.sgml, self.documents
    ({filename: Document}) and self.date_filed (date only, no time/timezone)
    '''
    self.url = url
    self.company = company

    self.text = GetRequest(url).response.text

    dtd = DTD()
    self.sgml = Sgml(self.text, dtd)
    root = self.sgml.map[dtd.sec_document.tag]

    # {filename: Document}; a later document with the same filename wins
    self.documents = {
        parsed.filename: parsed
        for parsed in map(Document, root[dtd.document.tag])
    }

    header = root[dtd.sec_header.tag]
    stamp_tag = dtd.acceptance_datetime.tag
    if stamp_tag in header:
        # value is YYYYMMDDhhmmss followed by junk; only the date is kept
        date_text = header[stamp_tag][:8]
    else:
        # no acceptance-datetime entry: take what follows " : " on the
        # first line of the header text instead
        first_line = header.split("\n", maxsplit=1)[0]
        date_text = first_line.split(" : ")[1]

    # not concerned with time/timezones
    self.date_filed = datetime.strptime(date_text, '%Y%m%d')
def get_index_json(year='', quarter=''):
    '''
    Fetches index.json for the given year/quarter and returns it parsed

    year and quarter default to '' and are concatenated directly into the
    url, so either can be replaced with an item.href from index.json

    :return: the decoded JSON content of the response
    '''
    index_url = FULL_INDEX_URL + year + quarter + INDEX_JSON
    payload = GetRequest(index_url).response.text
    return json.loads(payload)
def __init__(self, url, company=None):
    '''
    Downloads the SGML filing at url, parses it and indexes its documents

    :param url: location of the SGML filing to fetch
    :param company: company the filing belongs to (a company rather than a
        ticker symbol, since not all edgar companies are publicly traded)

    Sets self.url, self.company, self.text, self.sgml, self.documents
    ({filename: Document}) and self.date_filed (date only, no time/timezone)
    '''
    self.url = url
    # made this company instead of symbol since not all edgar companies are publicly traded
    self.company = company

    response = GetRequest(url).response
    text = response.text
    self.text = text

    print('Processing SGML at ' + url)

    dtd = DTD()
    sgml = Sgml(text, dtd)
    self.sgml = sgml

    # {filename:Document}
    self.documents = {}
    for document_raw in sgml.map[dtd.sec_document.tag][dtd.document.tag]:
        document = Document(document_raw)
        self.documents[document.filename] = document

    sec_header = sgml.map[dtd.sec_document.tag][dtd.sec_header.tag]
    acceptance_tag = dtd.acceptance_datetime.tag
    if acceptance_tag in sec_header:
        # YYYYMMDDhhmmss, the rest is junk
        acceptance_datetime_text = sec_header[acceptance_tag][:8]
    else:
        # NOTE(review): when the acceptance-datetime entry is missing the
        # header is assumed to be raw text whose first line ends with
        # " : YYYYMMDD" -- confirm against the Sgml parser
        first_line = sec_header.split("\n", maxsplit=1)[0]
        acceptance_datetime_text = first_line.split(" : ")[1]

    # not concerned with time/timezones
    self.date_filed = datetime.strptime(acceptance_datetime_text, '%Y%m%d')
def _get_filing_info(cik='', forms=None, year='', quarter=''):
    '''
    Return a List of FilingInfo

    If forms are specified, only filings with the given value will be
    returned e.g. 10-K, 10-Q, 3, 4, 5

    year and quarter are defaulted to '', but can be replaced with an
    item.href from index.json

    :param cik: CIK to look up, as a string; '' returns filings for all ciks
    :param forms: form types to keep; None or empty keeps every form type
    :raises InvalidInputException: if a requested form is not in
        SUPPORTED_FORMS
    '''
    # None stands in for "all forms" so no mutable default is shared
    forms = [] if forms is None else forms

    def _get_raw_data(row):
        '''
        Returns a list from a string (master idx row) that is delimited by "|"

        Format of master.idx file is as follows:

        CIK|Company Name|Form Type|Date Filed|Filename
        --------------------------------------------------------------------------------
        1000209|MEDALLION FINANCIAL CORP|8-K|2019-01-08|edgar/data/1000209/0001193125-19-004285.txt
        1000209|MEDALLION FINANCIAL CORP|8-K|2019-01-11|edgar/data/1000209/0001193125-19-007413.txt
        1000228|HENRY SCHEIN INC|425|2019-01-07|edgar/data/1000228/0001193125-19-003023.txt
        '''
        return row.split('|')

    def _add_filing_info(filing_infos, data, forms):
        '''
        Adds a FilingInfo from data to a list

        Rows that do not split into exactly 5 fields, or whose form type is
        not wanted, are ignored

        :param data: list of length 5 with the following data
            indices: 0=cik, 1=company, 2=form, 3=date_filed, 4=file_name
        '''
        # Form Type should be among forms, or forms be empty (all)
        if len(data) == 5 and (not forms or data[2] in forms):
            filing_infos.append(FilingInfo(
                data[1],  # Company Name
                data[2],  # Form Type
                data[0],  # CIK
                data[3],  # Date Filed
                data[4].strip()  # File Name
            ))

    for form in forms:
        if form not in SUPPORTED_FORMS:
            raise InvalidInputException(
                '{} is not a supported form'.format(form))

    # using master.idx so it's sorted by cik and we can use binary search
    url = '{}{}{}{}'.format(FULL_INDEX_URL, year, quarter, MASTER_IDX)
    print('getting {} filing info from {}'.format(forms, url))

    response = GetRequest(url).response
    text = response.text

    rows = text.split('\n')
    # the first 11 lines are skipped -- presumably the descriptive header,
    # column names and dashed separator that precede the data rows
    data_rows = rows[11:]

    filing_infos = []

    if cik != '':
        # binary search over the half-open range [start, end)
        start = 0
        end = len(data_rows)

        while start < end:
            mid = (start + end) // 2
            data = _get_raw_data(data_rows[mid])

            # comparisons are done as strings, same as ordering in master.idx
            # e.g. 11 > 100
            if data[0] == cik:
                # matched cik
                _add_filing_info(filing_infos, data, forms)

                # get all before and after (there can be multiple)
                # go backwards to get those before; the bound is checked
                # before indexing so index -1 never wraps to the last row
                index = mid - 1
                while index >= 0:
                    data = _get_raw_data(data_rows[index])
                    if data[0] != cik:
                        break
                    _add_filing_info(filing_infos, data, forms)
                    index -= 1

                # after; the bound is checked before indexing so a match
                # on the final row does not raise IndexError
                index = mid + 1
                while index < len(data_rows):
                    data = _get_raw_data(data_rows[index])
                    if data[0] != cik:
                        break
                    _add_filing_info(filing_infos, data, forms)
                    index += 1

                break

            elif data[0] < cik:
                start = mid + 1
            else:
                # end is exclusive, so mid itself is dropped while mid - 1
                # stays inside the search range
                end = mid
    else:
        # go through all
        for row in data_rows:
            data = _get_raw_data(row)
            _add_filing_info(filing_infos, data, forms)

    return filing_infos
def _get_financial_data_lite(self, statement_short_names, get_all):
    '''
    Returns financial data used for processing 10-Q and 10-K documents

    :param statement_short_names: passed to self._get_statement_lite to
        obtain the (short name, filename) statements to look for
    :param get_all: when truthy every report found is collected into a
        list; otherwise the first report found is returned on its own
    :return: a list of reports (get_all), a single report (not get_all),
        or False when there is no filing summary or when the last
        statement's report could not be produced
    '''
    import json

    summary_document = self.documents_lite[FILING_SUMMARY_FILE]
    # if a FilingSummary.xml file does not exist... return False
    if 'FilingSummary' not in summary_document:
        print('No Filing Summary exists')
        return False

    summary_reports = summary_document['FilingSummary']['MyReports']['Report']

    collected = []
    statements = self._get_statement_lite(statement_short_names)

    for position, statement in enumerate(statements, start=1):
        short_name = statement[0]
        filename = statement[1]

        for report in summary_reports:
            if short_name != report['ShortName'].lower():
                continue

            report_url = self.url + '/' + filename
            report_text = GetRequest(report_url).response.text

            dtd = DTD()
            sgml = Sgml(report_text, dtd)
            self.sgml = sgml

            # round-trip the map through JSON, dropping the first 16 and
            # last 2 characters of the serialized text -- presumably this
            # peels off an outer wrapper around the document; TODO confirm
            serialized = json.dumps(sgml.map)
            sgml.map = json.loads(serialized[16:-2])

            document = Document(sgml.map)
            self.documents[filename] = document
            financial_html_text = document.doc_text.data

            financial_report = get_financial_report_lite(
                self.company, financial_html_text)

            # get_financial_report_lite returns False on error: pass that
            # up the stack if this was the last statement, otherwise move on
            if financial_report == False:
                if position == len(statements):
                    return False
                continue

            if not get_all:
                return financial_report
            collected.append(financial_report)

    return collected