def get_filings_by_company(company, filing_type, n):
    tree = company.get_all_filings(filing_type=filing_type)
    docs_lxml = Company.get_documents(tree, no_of_documents=n)
    docs_data = Company.get_documents(tree, no_of_documents=n, as_documents=True)
    # get_documents returns a bare element rather than a list when n == 1
    if not isinstance(docs_lxml, list):
        docs_lxml = [docs_lxml]
        docs_data = [docs_data]

    return (docs_lxml, docs_data)
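A minimal usage sketch for the helper above, assuming the edgar package's Company class used throughout these examples (the Oracle name/CIK pair is reused from Example #12):

from edgar import Company

oracle = Company("Oracle Corp", "0001341439")
docs_lxml, docs_data = get_filings_by_company(oracle, "10-K", 2)
for doc in docs_data:
    print(doc.content['Filing Date'])  # Document metadata dict, as in Example #6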
Example #2
def get_10K_doc_raw(name, cik):
    """
    Get the latest 10-K filing document for a given company
    using the edgar package
    """
    company = Company(name, cik)
    # tree = company.get_all_filings(filing_type="10-K")
    # docs = Company.get_documents(tree, no_of_documents=1)
    docs = company.get_10Ks(no_of_documents=1)
    return docs
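A quick usage sketch; the Apple name/CIK pair is taken from Example #9, and the isinstance guard mirrors Example #1, since get_documents may return a bare element when only one document is requested:

docs = get_10K_doc_raw('Apple Inc.', '0000320193')
doc = docs[0] if isinstance(docs, list) else docs
text = TXTML.parse_full_10K(doc)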
Example #3
def findWord(comp, cik):
    try:
        company = Company(comp, cik)
        doc = company.get_10K()
        text = TXTML.parse_full_10K(doc)
        # print(text)
        if re.search('blockchain', text, re.IGNORECASE):
            return "exists"
        else:
            return "does not"
    except Exception:
        # any failure (no 10-K on file, parse error) is treated as missing
        return "No 10-K"
Example #4
def get_filing_metadata(context, name: str, cik: str, filing: str,
                        no_filings: int):
    comp = Company(name, cik)
    tree = comp.get_all_filings(filing)
    # pass flags by keyword so they are not taken positionally
    docs = comp.get_documents(tree, no_of_documents=no_filings,
                              as_documents=True)

    filings = []

    #TODO #38 change return method to yield AssetMaterialization()
    for document in docs:
        filings.append(clean_filings(document, cik, filing))

    context.log.info(log_assert_type(filings, dict))
    return filings
Example #5
def pull_10K(company_name, company_id):
    company = Company(company_name, company_id)
    tree = company.get_all_filings(filing_type="10-K")
    # pause a random 1-25 seconds so repeated pulls stay polite to EDGAR
    time.sleep(random.randint(1, 25))
    docs = Company.get_documents(tree, no_of_documents=3)
    text_l = []
    for doc in docs:
        try:
            text_l.append(TXTML.parse_full_10K(doc))
        except IndexError:
            pass
    return text_l
Example #6
def file_date(com, cik, no_docs):
    """
    Pull only the filing date for each document.
    Serves as the date of measurement for analyzing returns.
    """
    company = Company(com, cik)  # Company only needs the name and CIK here
    tree = company.get_all_filings(filing_type="10-K")
    docs = Company.get_documents(tree,
                                 no_of_documents=no_docs,
                                 as_documents=True)
    dates = []
    for doc in docs[:no_docs]:
        dates.append(doc.content['Filing Date'])

    return dates
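A hedged usage sketch, reusing the MICROSOFT CORP name/CIK pair from the docstring of the last example in this collection:

dates = file_date('MICROSOFT CORP', '0000789019', 3)
print(dates)  # e.g. three 'Filing Date' strings, newest first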
Example #7
def get_company_by_cik(cik):
    # EDGAR CIKs are zero-padded to 10 digits
    cik = str(cik).zfill(10)
    name = edgar.get_company_name_by_cik(cik)
    company = Company(name, cik)

    return company
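A brief sketch, assuming import edgar and from edgar import Company as in the surrounding examples; 320193 is Apple's unpadded CIK from Example #9:

company = get_company_by_cik(320193)  # CIK is padded to '0000320193'
print(company.name, company.cik)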
Example #8
def pull_10K(name, company_id):
    '''
    We use this function to fetch the filings.
    Run it while iterating over our list of tickers:
    each ticker gets parsed and collected into a dataframe.
    '''
    company = Company(name, company_id)
    tree = company.get_all_filings(filing_type="10-K")

    docs = Company.get_documents(tree, no_of_documents=6)
    # print("checkpoint: retrieving documents...")
    text_l = []
    for doc in docs:
        try:
            text_l.append(TXTML.parse_full_10K(doc))
        except IndexError:
            pass
    return text_l
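The docstring above mentions collecting each ticker's parsed text into a dataframe; a minimal sketch of that driver loop, assuming pandas and reusing the company list from Example #9:

import pandas as pd

company_list = [('AMAZON COM INC', '0001018724'),
                ('Apple Inc.', '0000320193')]
df = pd.DataFrame([{'company': name, 'filings': pull_10K(name, cik)}
                   for name, cik in company_list])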
Example #9
def main():
    # establish a list of companies to extract data from
    company_list = [('AMAZON COM INC', '0001018724'),
                    ('Apple Inc.', '0000320193')]

    # iterate through the companies, calling the get_xbrl function on each
    xbrl_files = [get_xbrl(Company(pair[0], pair[1])) for pair in company_list]

    # fill pandas with the segment data (only the first company's XBRL here)
    segment_df = xbrl_to_df(xbrl_files[0])

    segment_df.to_csv(Path.cwd() / 'SegmentData.csv', index=False)
Example #10
def has_ex_10(edgar):
    company_df = init_edgar_df(edgar)
    company_df['ex-10'] = False

    for idx, row in company_df.drop_duplicates(
            subset='cik', keep='first').iloc[4000:7000].iterrows():
        cik = row['cik']

        # initialize a Company instance
        company = Company(name=edgar.get_company_name_by_cik(cik), cik=cik)

        # get all the "EX-10" type documents from the company's 10-K
        documents = company.get_document_type_from_10K('EX-10',
                                                       no_of_documents=1)

        if documents:
            company_df.at[idx, 'ex-10'] = True

    ex_10_df = company_df[company_df['ex-10']]
    ex_10_df.to_csv(
        '/Users/sorenlittle/PycharmProjects/edgar_spacy_training/ex_10_df/ex_10_df_4000_7000.csv'
    )
Example #11
from edgar import Company, XBRL, XBRLElement, TXTML, Edgar, Document
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
import sys

# company = Company("INTERNATIONAL BUSINESS MACHINES CORP", "0000051143")
# company2 = Company("twitter", "0001418091")
# company3 = Company("Oracle Corp", "0001341439")
company4 = Company("GOOGLE INC", "0001288776")

# edgar = Edgar()
# possible_companies = edgar.find_company_name("Cisco System")
#
# print(possible_companies)

doc = company4.get_10K()
text = TXTML.parse_full_10K(doc)

print('10-K parsed')

with open("text2.txt", "w+") as f:
    f.write(text)

# f = open('text.txt', 'r')
# for line in f:
#     print(line)
#     print()
Example #12
from edgar import Company, TXTML, XBRLElement, XBRL, Edgar

db = Edgar()
comp_name = 'TESLA, INC.'

company = Company(comp_name, db.all_companies_dict[comp_name])
'''
company = Company("Oracle Corp", "0001341439")
tree = company.get_all_filings(filing_type = "10-K")
docs = Company.get_documents(tree, no_of_documents=5)
print (docs)

text = TXTML.parse_full_10K(docs[0])
#print (text)
#company = edgar.Company(Ticker,"21344")
#print company


company = Company("Oracle Corp", "0001341439")
results = company.get_data_files_from_10K("EX-101.INS", isxml=True)
xbrl = XBRL(results[0])
element = XBRLElement(xbrl.relevant_children_parsed[15]).to_dict()  # returns a dictionary of name, value, and schemaRef
print(element)
'''
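A live sketch mirroring the commented-out block, but using the Tesla Company instance built above and the same get_10K/parse_full_10K calls seen in the other examples:

doc = company.get_10K()
text = TXTML.parse_full_10K(doc)
print(text[:500])  # first 500 characters of the latest 10-K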
Example #13
    def search_company(self, name, cik,
                       filing_type, filing_subtype, no_of_entries, filing_date_before, filing_pattern,
                       filing_rsrc_cache):
        base_url = self.aconfig['args'].endpoint
        acquirePatterns = OrderedDict()

        if len(filing_pattern) == 0 and not filing_rsrc_cache:
            print("Ambiguous options: no search pattern (-P) and no resource download (-d); choose at least one mode")
            return

        for pattern in filing_pattern:
            acquirePatterns[pattern] = re.compile(pattern)

        self.alogger.debug("Name:{0} CIK:{1} Filing:{2} Subtype:{3}".format(name, cik, filing_type, filing_subtype))
        company = Company(name, cik)

        print("Filings endpoint:", company.get_filings_url())
        tree = company.get_all_filings(filing_type=filing_type,
                                       no_of_entries=no_of_entries, prior_to=filing_date_before)

        url_groups = company._group_document_type(tree, filing_type)
        result = OrderedDict()
        for url_group in url_groups:
            for url in url_group:
                url = base_url + url
                self.alogger.debug("In Content page: {0} ".format(url))
                content_page = Company.get_request(url)
                try:
                    table = content_page.find_class("tableFile")[0]
                    for row in table.getchildren():

                        # Match on 4th column of the row `Type`
                        if filing_subtype in row.getchildren()[3].text:
                            self.alogger.debug("Subtype found: {0}".format(row.getchildren()[3].text))
                            href = row.getchildren()[2].getchildren()[0].attrib["href"]
                            href_txt = row.getchildren()[2].getchildren()[0].text_content()

                            if href and not href_txt:
                                self.alogger.debug(" but no link for the resource posted. skipping")
                                continue

                            # SEC XBRL inline viewer. Remove that cruft, get raw document if applicable
                            href = href.replace("/ix?doc=", "")
                            href = base_url + href

                            self.alogger.debug("Processing resource: {0}".format(href))
                            # Fetch the filing doc and process
                            if filing_rsrc_cache:
                                rsrc_cache_path = urlparse(href).path.strip("/")
                                rsrc_cache_dir = os.path.dirname(rsrc_cache_path)
                                r = requests.get(href)
                                self.alogger.debug("Making repository structure")
                                os.makedirs(rsrc_cache_dir, exist_ok=True)
                                print("Storing {} from {} locally: {}".format(href_txt, href, rsrc_cache_path))
                                with open(rsrc_cache_path, 'wb') as f:
                                    f.write(r.content)
                            else:
                                print("Working on {} ...".format(href))
                                doc = Company.get_request(href)
                                tree_str = str(etree.tostring(doc), 'utf-8')
                                tree_str_text = html2text.html2text(tree_str)
                                result[href] = tree_str_text

                except IndexError:
                    # content page has no "tableFile" listing; skip it
                    pass

        if not filing_rsrc_cache and len(filing_pattern) != 0:
            self.alogger.debug("Matched filing types count: {} ".format(len(result)))

            self.alogger.debug("Performing pattern matching")
            for filing_resource, filing_text in result.items():
                for pattern, cpattern in acquirePatterns.items():
                    if re.search(cpattern, filing_text):
                        self.alogger.debug("Pattern Matches: {0}".format(filing_resource))
                        self.search_string(filing_text, 1, 1, pattern)
Example #14
from edgar import Company, TXTML
import re
import pandas as pd

df = pd.read_excel(r'companylist.xls')

expense_estimates = []
for i in df.index:
    print(expense_estimates)
    CIK_string = df['CIK'][i].split("; ")
    print(df['Company Name'][i])
    company = Company("df['Company Name'][i]", CIK_string[0])
    try:
        doc = company.get_10K()
        text = TXTML.parse_full_10K(doc)
    except IndexError:
        expense_estimates.append(float("NaN"))
        continue
    if 'hipping' not in text:  # matches both "Shipping" and "shipping"
        expense_estimates.append(float("NaN"))
        continue
    matches = [m.start() for m in re.finditer('hipping', text)]
    #print(matches)
    string = ""
    est_available = False
    for m in matches:
        if '$' in text[m:m + 50]:
            string = text[m:m + 200]
            est_available = True
            break
    if not est_available:
        expense_estimates.append(float("NaN"))
Example #15
dfmap['id'] = dfmap['id'].astype(str).str.zfill(10)

#read the source list of tickers
dft = pd.read_csv('et.csv', header=None)
dft.columns = ['ticker']

#join with the sec ticker master file to add the 'id' column
dft = dft.merge(dfmap, on='ticker', how='inner')
dft = dft.drop_duplicates()

dfsftcols = ['ticker', 'earn_datetime']
dfSECFileTimes = pd.DataFrame(columns=dfsftcols)

for row in dft.itertuples():
    print(row.ticker + ' ' + row.id)
    company = Company(row.ticker, row.id)
    tree = company.get_all_filings(filing_type="8-K")
    hrefs = tree.xpath('//*[@id="documentsbutton"]')
    descs = tree.xpath('//div[4]/div[4]//td[3]')

    for desc, href in zip(descs, hrefs):
        if desc.text_content().strip().find(' 2.02') > -1:
            lnk = 'https://www.sec.gov' + href.get('href')
            con = Documents(lnk).content
            # stop once we reach filings accepted back in 2014
            if con['Accepted'][:4] == '2014':
                break
            sleep(0.2)  # throttle requests to EDGAR
            dfSECFileTimes = pd.concat(
                [dfSECFileTimes,
                 pd.DataFrame([[row.ticker, con['Accepted']]],
                              columns=dfsftcols)],
                ignore_index=True)
            print(" ".join([row.ticker, con['Accepted'], lnk]))
Example #16
def get_edgar_filing_text(comp_tuples, f_type, n_docs, file_dir, dates_dir):
    '''
    ---------------------------------------------------------------------------
    Scraping function to get the text from company filings from EDGAR. 
    
    Inputs:
        
    - comp_tuples : A list with pairwise company tuples. The first element must 
                    be a string with the company name as listed on the EDGAR
                    database. The second element must be a string with the CIK
                    identifier. See get_sp500_tickers_cik_industry(argin) to 
                    easily get the tuples from the S&P500 listed firms.
                
    - f_type      : A string with the filing type.
    
    - n_docs      : Number of filings to be fetched, newest first, i.e.
                    n_docs = 3 will fetch the three newest filings of type
                    f_type. As an integer.
                    
    - file_dir    : The master directory where all filings are to be saved. As
                    a string.
                    
    - dates_dir   : The master directory where all filing dates are saved. If 
                    a directory is missing, the function will instead scrape 
                    the dates using get_edgar_filing_date(argin), and create a 
                    new folder with the dates.
                    
    Example: 
        
    comp_tuples = [['APPLE INC'     , '0000320193'],
                   ['MCDONALDS CORP', '0000063908'],
                   ['MICROSOFT CORP', '0000789019']]

    f_type      = '10-K'     [Or '10-Q']
    
    n_docs      = 3
    
    file_dir    = 'Users/Tobias/Dropbox/textfolder/Text Reports U.S.'
    
    dates_dir   = 'Users/Tobias/Dropbox/textfolder/Dates Reports U.S' 
                   
    get_edgar_filing_text(comp_tuples,f_type,n_docs,file_dir,dates_dir)
    ---------------------------------------------------------------------------
    '''

    print('Fetching data...')
    print('-' * 80 + '\n')
    for idx, comp_tuple in enumerate(comp_tuples):
        comp = edgar.Company(comp_tuple[0], comp_tuple[1])
        tree = comp.get_all_filings(filing_type=f_type)
        docs = Company.get_documents(tree, no_of_documents=n_docs)

        # Now that we have the filings, get the filing dates for each
        # document. If we have them already, then great, let's load them. If
        # not, call get_edgar_filing_date to get them for this company.
        dir = os.path.join(dates_dir, f_type, comp_tuple[0])
        if not os.path.exists(dir):
            print(('\nCannot find the dates for ' + comp_tuple[0] +
                   '. Attempting to download them...'))
            get_edgar_filing_date([comp_tuple], f_type, dates_dir)
        else:
            os.chdir(dir)
            if comp_tuple[0].endswith('.'):
                # strip the trailing period to match the saved pickle's file name
                comp_tuple[0] = comp_tuple[0][:-1]
            with open(comp_tuple[0] + '.pickle', 'rb') as file:
                dates = pickle.load(file)
                dates = dates[:n_docs]
        dir = os.path.join(file_dir, f_type, comp_tuple[0])
        if not os.path.exists(dir):
            os.makedirs(dir)
        os.chdir(dir)
        for date, doc in zip(dates, docs):
            with open(date.replace('.pickle', '') + '.txt', 'w',
                      encoding='utf8') as f:
                f.write(str(doc))
        mes = ('Status: ' + str(int(
            (idx + 1) / len(comp_tuples) * 100)) + '% done')
        sys.stdout.write('\r' + mes)