Exemplo n.º 1
0
    def __init__(self, url, company=None):
        '''
        Fetches the SGML text at url, parses it and indexes the documents it holds

        :param url: location of the SGML text, requested through GetRequest
        :param company: optional company this filing belongs to

        Attributes set: url, company, text (raw response text), sgml (parsed
        Sgml), documents ({filename: Document}) and date_filed (datetime
        parsed from a YYYYMMDD string, so no time of day is kept).
        '''
        # company rather than ticker symbol: not every edgar company is publicly traded
        self.url, self.company = url, company

        self.text = GetRequest(url).response.text

        dtd = DTD()
        self.sgml = Sgml(self.text, dtd)

        sec_document = self.sgml.map[dtd.sec_document.tag]

        # {filename:Document}
        self.documents = {}
        for raw_document in sec_document[dtd.document.tag]:
            parsed = Document(raw_document)
            self.documents[parsed.filename] = parsed

        header = sec_document[dtd.sec_header.tag]
        if dtd.acceptance_datetime.tag in header:
            # value looks like YYYYMMDDhhmmss plus junk; keep the date part only
            date_text = header[dtd.acceptance_datetime.tag][:8]
        else:
            # otherwise take what follows " : " on the first line of the header
            first_line = header.split("\n", maxsplit=1)[0]
            date_text = first_line.split(" : ")[1]

        # not concerned with time/timezones
        self.date_filed = datetime.strptime(date_text, '%Y%m%d')
Exemplo n.º 2
0
def get_index_json(year='', quarter=''):
    '''
    Fetches index.json and returns its parsed content

    :param year: path piece appended after FULL_INDEX_URL; defaults to ''
        but can be an item.href taken from index.json
    :param quarter: path piece appended after year; same convention
    :return: whatever json.loads produces for the response text
    '''
    index_url = ''.join((FULL_INDEX_URL, year, quarter, INDEX_JSON))
    return json.loads(GetRequest(index_url).response.text)
Exemplo n.º 3
0
    def __init__(self, url, company=None):
        '''
        Fetches the SGML text at url, parses it and indexes the documents it holds

        :param url: location of the SGML text, requested through GetRequest
        :param company: optional company this filing belongs to

        Attributes set: url, company, text (raw response text), sgml (parsed
        Sgml), documents ({filename: Document}) and date_filed (datetime
        parsed from a YYYYMMDD string, so no time of day is kept).
        '''
        self.url = url
        # made this company instead of symbol since not all edgar companies are publicly traded
        self.company = company

        response = GetRequest(url).response
        self.text = response.text

        print('Processing SGML at ' + url)

        dtd = DTD()
        self.sgml = Sgml(self.text, dtd)

        sec_document = self.sgml.map[dtd.sec_document.tag]

        # {filename:Document}
        self.documents = {}
        for document_raw in sec_document[dtd.document.tag]:
            document = Document(document_raw)
            self.documents[document.filename] = document

        sec_header = sec_document[dtd.sec_header.tag]
        if dtd.acceptance_datetime.tag in sec_header:
            # YYYYMMDDhhmmss, the rest is junk; only the date part is kept
            acceptance_datetime_text = sec_header[dtd.acceptance_datetime.tag][:8]
        else:
            # No acceptance-datetime element in the header.
            # NOTE(review): assumes the header is then a raw string whose first
            # line ends with " : YYYYMMDD" -- confirm against the Sgml parser.
            first_line = sec_header.split("\n", maxsplit=1)[0]
            acceptance_datetime_text = first_line.split(" : ")[1]

        # not concerned with time/timezones
        self.date_filed = datetime.strptime(acceptance_datetime_text, '%Y%m%d')
Exemplo n.º 4
0
def _get_filing_info(cik='', forms=[], year='', quarter=''):
    '''
    Return a List of FilingInfo
        If forms are specified, only filings with the given value will be returned
        e.g. 10-K, 10-Q, 3, 4, 5
        If cik is specified, only rows whose CIK column equals it are returned
        year and quarter are defaulted to '', but can be replaced with an item.href
        from index.json

    :param cik: CIK to look up, compared as a string; '' means every company
    :param forms: form types to keep; empty means all forms (the default list
        is only read, never mutated)
    :param year: path piece placed after FULL_INDEX_URL
    :param quarter: path piece placed after year
    :return: list of FilingInfo. Without cik the list follows file order; with
        cik it holds the row hit by the search first, then the matching rows
        before it (nearest first), then the matching rows after it
    :raises InvalidInputException: if an entry of forms is not in SUPPORTED_FORMS
    '''
    def _get_raw_data(row):
        '''
        Returns a list from a string (master idx row) that is delimited by "|"

        Format of master.idx file is as follows:

        CIK|Company Name|Form Type|Date Filed|Filename
        --------------------------------------------------------------------------------
        1000209|MEDALLION FINANCIAL CORP|8-K|2019-01-08|edgar/data/1000209/0001193125-19-004285.txt
        1000209|MEDALLION FINANCIAL CORP|8-K|2019-01-11|edgar/data/1000209/0001193125-19-007413.txt
        1000228|HENRY SCHEIN INC|425|2019-01-07|edgar/data/1000228/0001193125-19-003023.txt
        '''
        return row.split('|')

    def _add_filing_info(filing_infos, data, forms):
        '''
        Adds a FilingInfo from data to a list

        :param data: list of length 5 with the following data indices:
            0=cik, 1=company, 2=form, 3=date_filed, 4=file_name
        '''
        if len(data) == 5 and (not forms or data[2] in forms):
            # Form Type should be among forms, or forms should be the default (all)
            filing_infos.append(FilingInfo(
                        data[1], # Company Name
                        data[2], # Form Type
                        data[0], # CIK
                        data[3], # Date Filed
                        data[4].strip() # File Name
                    ))

    for form in forms:
        if form not in SUPPORTED_FORMS:
            raise InvalidInputException('{} is not a supported form'.format(form))

    # using master.idx so it's sorted by cik and we can use binary search
    url = '{}{}{}{}'.format(FULL_INDEX_URL, year, quarter, MASTER_IDX)
    print('getting {} filing info from {}'.format(forms, url))

    response = GetRequest(url).response
    rows = response.text.split('\n')
    # Skip the 11 header lines. Blank rows (e.g. the empty string left after a
    # trailing newline) are dropped: they never yield a FilingInfo and would
    # break the sorted order the binary search relies on.
    data_rows = [row for row in rows[11:] if row.strip()]

    filing_infos = []

    if cik == '':
        # go through all
        for row in data_rows:
            _add_filing_info(filing_infos, _get_raw_data(row), forms)
        return filing_infos

    # binary search over the half-open range [start, end)
    start = 0
    end = len(data_rows)

    while start < end:
        mid = (start + end) // 2
        data = _get_raw_data(data_rows[mid])

        # comparisons are done as strings, same as ordering in master.idx
        # e.g. 11 > 100
        if data[0] == cik:
            # matched cik
            _add_filing_info(filing_infos, data, forms)

            # there can be several rows for one cik: collect the ones before
            # the hit; the bound is checked before the row is read
            index = mid - 1
            while index >= 0:
                data = _get_raw_data(data_rows[index])
                if data[0] != cik:
                    break
                _add_filing_info(filing_infos, data, forms)
                index -= 1

            # ... and the ones after it, stopping cleanly at the last row
            index = mid + 1
            while index < len(data_rows):
                data = _get_raw_data(data_rows[index])
                if data[0] != cik:
                    break
                _add_filing_info(filing_infos, data, forms)
                index += 1

            break

        elif data[0] < cik:
            start = mid + 1
        else:
            # end is exclusive, so mid itself is already ruled out by end = mid
            end = mid

    return filing_infos
Exemplo n.º 5
0
    def _get_financial_data_lite(self, statement_short_names, get_all):
        '''
        Gathers financial reports used for processing 10-Q and 10-K documents

        :param statement_short_names: handed to self._get_statement_lite to
            obtain the (short name, filename) statements to look for
        :param get_all: when truthy every report is collected into a list,
            otherwise the first report obtained is returned on its own
        :return: False if there is no filing summary or if the report for the
            last statement could not be built; a single report when get_all is
            falsy; otherwise the list of collected reports (possibly empty)
        '''
        import json

        # if a FilingSummary.xml file does not exist... return False
        if 'FilingSummary' not in self.documents_lite[FILING_SUMMARY_FILE]:
            print('No Filing Summary exists')
            return False

        summary = self.documents_lite[FILING_SUMMARY_FILE]['FilingSummary']
        summary_reports = summary['MyReports']['Report']

        statements = self._get_statement_lite(statement_short_names)
        collected = []

        for position, statement in enumerate(statements, start=1):
            short_name = statement[0]
            filename = statement[1]

            for report in summary_reports:
                if short_name != report['ShortName'].lower():
                    continue

                page = GetRequest(self.url + '/' + filename).response
                dtd = DTD()
                sgml = Sgml(page.text, dtd)
                self.sgml = sgml

                # Serialise the map, cut the first 16 and the last 2 characters
                # and parse what is left.
                # NOTE(review): presumably this unwraps an outer
                # {"<TAG>": [ ... ]} layer -- confirm against Sgml.map.
                serialized = json.dumps(sgml.map)
                sgml.map = json.loads(serialized[16:-2])

                self.documents[filename] = Document(sgml.map)
                html_text = self.documents[filename].doc_text.data

                outcome = get_financial_report_lite(self.company, html_text)
                # get_financial_report_lite signals an error by returning False
                if outcome == False:
                    if position == len(statements):
                        # nothing left to try: pass the failure up the stack
                        return False
                    # otherwise move on to the next summary report entry
                    continue

                if not get_all:
                    return outcome
                collected.append(outcome)

        return collected