def get_filings_by_company(company, filing_type, n):
    # Fetch a company's filings once, then extract them both as raw lxml
    # trees and as edgar Document objects (for metadata access).
    tree = company.get_all_filings(filing_type=filing_type)
    docs_lxml = Company.get_documents(tree, no_of_documents=n)
    docs_data = Company.get_documents(tree, no_of_documents=n, as_documents=True)
    if not isinstance(docs_lxml, list):  # a single document may be returned bare
        docs_lxml = [docs_lxml]
        docs_data = [docs_data]
    return (docs_lxml, docs_data)
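# A minimal usage sketch for get_filings_by_company (the Apple name/CIK pair is
# taken from main() further down; any listed company works).
apple = Company('Apple Inc.', '0000320193')
lxml_docs, doc_objects = get_filings_by_company(apple, '10-K', 2)
for d in doc_objects:
    print(d.content['Filing Date'])  # Document objects expose metadata via .content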
def get_10K_doc_raw(name, cik):
    """
    Get the latest 10-K filing document for a given company using the edgar package
    """
    company = Company(name, cik)
    # tree = company.get_all_filings(filing_type="10-K")
    # docs = Company.get_documents(tree, no_of_documents=1)
    docs = company.get_10Ks(no_of_documents=1)
    return docs
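# Hedged example: fetch the most recent 10-K for Microsoft (name/CIK pair as in
# the comp_tuples example at the bottom of this file).
msft_docs = get_10K_doc_raw('MICROSOFT CORP', '0000789019')
print(msft_docs)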
def findWord(comp, cik):
    # Check whether the word "blockchain" appears in a company's latest 10-K.
    try:
        company = Company(comp, cik)
        doc = company.get_10K()
        text = TXTML.parse_full_10K(doc)
        # print(text)
        if re.search('blockchain', text, re.IGNORECASE):
            return "exists"
        else:
            return "does not"
    except Exception:
        return "No 10-k"
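# Example call for findWord, reusing the Apple name/CIK pair used elsewhere in
# this file; prints "exists", "does not", or "No 10-k".
print(findWord('Apple Inc.', '0000320193'))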
def get_filing_metadata(context, name: str, cik: str, filing: str, no_filings: int):
    comp = Company(name, cik)
    tree = comp.get_all_filings(filing)
    docs = comp.get_documents(tree, no_filings, True)
    filings = []
    # TODO #38 change return method to yield AssetMaterialization()
    for document in docs:
        filings.append(clean_filings(document, cik, filing))
    context.log.info(log_assert_type(filings, dict))
    return filings
def pull_10K(company_name, company_id):
    # Pull the three most recent 10-K filings and return their parsed text.
    company = Company(company_name, company_id)
    tree = company.get_all_filings(filing_type="10-K")
    # Throttle the request with a random 1-25 second pause to avoid
    # hammering the EDGAR servers.
    time.sleep(random.randint(1, 25))
    docs = Company.get_documents(tree, no_of_documents=3)
    text_l = []
    for doc in docs:
        try:
            text_l.append(TXTML.parse_full_10K(doc))
        except IndexError:
            pass
    return text_l
def file_date(com, cik, no_docs):
    """
    Pull only the filing dates of a company's 10-K filings.
    The dates serve as the dates of measurement for analyzing returns.
    """
    company = Company(com, cik)
    tree = company.get_all_filings(filing_type="10-K")
    docs = Company.get_documents(tree, no_of_documents=no_docs, as_documents=True)
    dates = []
    for doc in docs:
        dates.append(doc.content['Filing Date'])
    return dates
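# Sketch: filing dates of the two most recent Amazon 10-Ks (name/CIK pair as
# listed in main() below); returns the 'Filing Date' metadata strings.
print(file_date('AMAZON COM INC', '0001018724', 2))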
def get_company_by_cik(cik):
    # EDGAR expects CIKs zero-padded to 10 digits.
    cik = str(cik).zfill(10)
    name = edgar.get_company_name_by_cik(cik)
    company = Company(name, cik)
    return company
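# Usage sketch, assuming `edgar` here is an Edgar() instance as in has_ex_10
# below: an un-padded CIK is zero-filled before the name lookup.
company = get_company_by_cik('320193')  # looked up as '0000320193' (Apple Inc.)
print(company.name, company.cik)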
def pull_10K(name, company_id):
    '''
    Perform the filings download. Run this function while iterating over a
    list of tickers; each ticker's filings are parsed and collected into a
    dataframe.
    '''
    company = Company(name, company_id)
    tree = company.get_all_filings(filing_type="10-K")
    docs = Company.get_documents(tree, no_of_documents=6)
    # print("checkpoint: retrieving documents...")
    text_l = []
    for doc in docs:
        try:
            text_l.append(TXTML.parse_full_10K(doc))
        except IndexError:
            pass
    return text_l
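# A hedged sketch of the iteration the docstring describes: loop over
# (name, CIK) pairs, pull each company's recent 10-K texts, and collect them
# into a dataframe. The ticker list here is illustrative.
import pandas as pd

tickers = [('AMAZON COM INC', '0001018724'), ('Apple Inc.', '0000320193')]
rows = []
for name, cik in tickers:
    for filing_text in pull_10K(name, cik):
        rows.append({'company': name, 'cik': cik, 'text': filing_text})
df_10k = pd.DataFrame(rows)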
def main():
    # establish a list of companies to extract data from
    company_list = [('AMAZON COM INC', '0001018724'), ('Apple Inc.', '0000320193')]
    # iterate through the companies, calling the get_xbrl function on each
    xbrl_files = [get_xbrl(Company(name, cik)) for name, cik in company_list]
    # fill pandas with the segment data
    segment_df = xbrl_to_df(xbrl_files[0])
    segment_df.to_csv(Path.cwd() / 'SegmentData.csv', index=False)
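# get_xbrl and xbrl_to_df are defined elsewhere. A hedged sketch of what
# get_xbrl might look like, based on the EX-101.INS pattern used in the Oracle
# example further down this file (the name and body here are assumptions, and
# XBRL is assumed imported from the edgar package as in the scripts below):
def get_xbrl(company):
    # fetch the XBRL instance document attached to the company's latest 10-K
    results = company.get_data_files_from_10K("EX-101.INS", isxml=True)
    return XBRL(results[0])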
def has_ex_10(edgar):
    company_df = init_edgar_df(edgar)
    company_df['ex-10'] = False
    for idx, row in company_df.drop_duplicates(
            subset='cik', keep='first').iloc[4000:7000].iterrows():
        cik = row['cik']
        # initialize a Company instance
        company = Company(name=edgar.get_company_name_by_cik(cik), cik=cik)
        # get all the "EX-10" type documents from the company's 10-K filings
        documents = company.get_document_type_from_10K('EX-10', no_of_documents=1)
        if documents:
            company_df.at[idx, 'ex-10'] = True
    ex_10_df = company_df[company_df['ex-10'] == True]
    ex_10_df.to_csv(
        '/Users/sorenlittle/PycharmProjects/edgar_spacy_training/ex_10_df/ex_10_df_4000_7000.csv'
    )
from edgar import Company, XBRL, XBRLElement, TXTML, Edgar, Document
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
import sys

# company = Company("INTERNATIONAL BUSINESS MACHINES CORP", "0000051143")
# company2 = Company("twitter", "0001418091")
# company3 = Company("Oracle Corp", "0001341439")
company4 = Company("GOOGLE INC", "0001288776")

# edgar = Edgar()
# possible_companies = edgar.find_company_name("Cisco System")
# print(possible_companies)

doc = company4.get_10K()
text = TXTML.parse_full_10K(doc)
print('1')

with open("text2.txt", "w+") as f:
    f.write(text)

# f = open('text.txt', 'r')
# for line in f:
#     print(line)
#     print()
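# The sumy imports above are never used in this script; below is a minimal
# sketch of the TextRank summarization they point to (the sentence count of 10
# is an arbitrary choice).
parser = PlaintextParser.from_string(text, Tokenizer("english"))
summarizer = TextRankSummarizer(Stemmer("english"))
summarizer.stop_words = get_stop_words("english")
for sentence in summarizer(parser.document, 10):
    print(sentence)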
from edgar import Company, TXTML, XBRLElement, XBRL, Edgar

db = Edgar()
comp_name = 'TESLA, INC.'
company = Company(comp_name, db.all_companies_dict[comp_name])

'''
company = Company("Oracle Corp", "0001341439")
tree = company.get_all_filings(filing_type="10-K")
docs = Company.get_documents(tree, no_of_documents=5)
print(docs)
text = TXTML.parse_full_10K(docs[0])
# print(text)

# company = edgar.Company(Ticker, "21344")
# print(company)
company = Company("Oracle Corp", "0001341439")
results = company.get_data_files_from_10K("EX-101.INS", isxml=True)
xbrl = XBRL(results[0])
# to_dict() returns a dictionary of name, value, and schemaRef
element = XBRLElement(xbrl.relevant_children_parsed[15]).to_dict()
print(element)
'''
def search_company(self, name, cik, filing_type, filing_subtype, no_of_entries,
                   filing_date_before, filing_pattern, filing_rsrc_cache):
    base_url = self.aconfig['args'].endpoint
    acquirePatterns = OrderedDict()
    if len(filing_pattern) == 0 and not filing_rsrc_cache:
        print("Ambiguous options: no search patterns (-P) and no download of resources (-d). Choose one mode.")
        return
    for pattern in filing_pattern:
        acquirePatterns[pattern] = re.compile(pattern)
    self.alogger.debug("Name:{0} CIK:{1} Filing:{2} Subtype:{3}".format(name, cik, filing_type, filing_subtype))
    company = Company(name, cik)
    print("Filings endpoint:", company.get_filings_url())
    tree = company.get_all_filings(filing_type=filing_type,
                                   no_of_entries=no_of_entries,
                                   prior_to=filing_date_before)
    url_groups = company._group_document_type(tree, filing_type)
    result = OrderedDict()
    for url_group in url_groups:
        for url in url_group:
            url = base_url + url
            self.alogger.debug("In content page: {0}".format(url))
            content_page = Company.get_request(url)
            try:
                table = content_page.find_class("tableFile")[0]
                for row in table.getchildren():
                    # Match on the 4th column of the row, `Type`
                    if filing_subtype in row.getchildren()[3].text:
                        self.alogger.debug("Subtype found: {0}".format(row.getchildren()[3].text))
                        href = row.getchildren()[2].getchildren()[0].attrib["href"]
                        href_txt = row.getchildren()[2].getchildren()[0].text_content()
                        if href and not href_txt:
                            self.alogger.debug("...but no link for the resource posted; skipping")
                            continue
                        # SEC XBRL viewer link. Remove that cruft to get the raw document, if applicable
                        href = href.replace("/ix?doc=", "")
                        href = base_url + href
                        self.alogger.debug("Processing resource: {0}".format(href))
                        # Fetch the filing doc and process it
                        if filing_rsrc_cache:
                            rsrc_cache_path = urlparse(href).path.strip("/")
                            rsrc_cache_dir = os.path.dirname(rsrc_cache_path)
                            r = requests.get(href)
                            self.alogger.debug("Making repository structure")
                            os.makedirs(rsrc_cache_dir, exist_ok=True)
                            print("Storing {} from {} locally: {}".format(href_txt, href, rsrc_cache_path))
                            with open(rsrc_cache_path, 'wb') as f:
                                f.write(r.content)
                        else:
                            print("Working on {} ...".format(href))
                            doc = Company.get_request(href)
                            tree_str = str(etree.tostring(doc), 'utf-8')
                            tree_str_text = html2text.html2text(tree_str)
                            result[href] = tree_str_text
            except IndexError:
                pass
    if not filing_rsrc_cache and len(filing_pattern) != 0:
        self.alogger.debug("Matched filing types count: {}".format(len(result)))
        self.alogger.debug("Performing pattern matching")
        for filing_resource, filing_text in result.items():
            for pattern, cpattern in acquirePatterns.items():
                if re.search(cpattern, filing_text):
                    self.alogger.debug("Pattern matches: {0}".format(filing_resource))
                    self.search_string(filing_text, 1, 1, pattern)
from edgar import Company, TXTML
import re
import pandas as pd

df = pd.read_excel(r'companylist.xls')
expense_estimates = []
for i in df.index:
    print(expense_estimates)
    CIK_string = df['CIK'][i].split("; ")
    print(df['Company Name'][i])
    company = Company(df['Company Name'][i], CIK_string[0])
    try:
        doc = company.get_10K()
        text = TXTML.parse_full_10K(doc)
    except IndexError:
        expense_estimates.append(float("NaN"))
        continue
    if 'hipping' not in text:
        expense_estimates.append(float("NaN"))
        continue
    # find every occurrence of "hipping" (matches both "Shipping" and "shipping")
    matches = [m.start() for m in re.finditer('hipping', text)]
    # print(matches)
    string = ""
    est_available = False
    for pos in matches:
        # keep the first match with a dollar figure within 50 characters
        if '$' in text[pos:pos + 50]:
            string = text[pos:pos + 200]
            est_available = True
            break
    if not est_available:
        expense_estimates.append(float("NaN"))
dfmap['id'] = dfmap['id'].astype(str).str.zfill(10)

# read the source list of tickers
dft = pd.read_csv('et.csv', header=None)
dft.columns = ['ticker']

# join with the SEC ticker master file to add the 'id' column
dft = dft.merge(dfmap, on='ticker', how='inner')
dft = dft.drop_duplicates()

dfsftcols = ['ticker', 'earn_datetime']
dfSECFileTimes = pd.DataFrame(columns=dfsftcols)

for row in dft.itertuples():
    print(row.ticker + ' ' + row.id)
    company = Company(row.ticker, row.id)
    tree = company.get_all_filings(filing_type="8-K")
    hrefs = tree.xpath('//*[@id="documentsbutton"]')
    descs = tree.xpath('//div[4]/div[4]//td[3]')
    for i in zip(descs, hrefs):
        # look for Item 2.02 (results of operations) 8-K filings
        if i[0].text_content().strip().find(' 2.02') > -1:
            lnk = 'https://www.sec.gov' + i[1].get('href')
            con = Documents(lnk).content
            if con['Accepted'][:4] == '2014':
                break
            sleep(0.2)
    dfSECFileTimes = dfSECFileTimes.append(
        pd.DataFrame([[row.ticker, con['Accepted']]], columns=dfsftcols))
    print(" ".join([row.ticker, con['Accepted'], lnk]))
def get_edgar_filing_text(comp_tuples, f_type, n_docs, file_dir, dates_dir):
    '''
    ---------------------------------------------------------------------------
    Scraping function to get the text from company filings from EDGAR.

    Inputs:
        - comp_tuples : A list of pairwise company tuples. The first element
                        must be a string with the company name as listed on
                        the EDGAR database. The second element must be a
                        string with the CIK identifier. See
                        get_sp500_tickers_cik_industry(argin) to easily get
                        the tuples for the S&P 500 listed firms.
        - f_type      : A string with the filing type.
        - n_docs      : Number of filings to be fetched, in descending order,
                        i.e. n_docs = 3 will fetch the three newest filings
                        of type f_type. As an integer.
        - file_dir    : The master directory where all filings are to be
                        saved. As a string.
        - dates_dir   : The master directory where all filing dates are
                        saved. If a directory is missing, the function will
                        instead scrape the dates using
                        get_edgar_filing_date(argin) and create a new folder
                        with the dates.

    Example:
        comp_tuples = [['APPLE INC'     , '0000320193'],
                       ['MCDONALDS CORP', '0000063908'],
                       ['MICROSOFT CORP', '0000789019']]
        f_type    = '10-K'  # or '10-Q'
        n_docs    = 3
        file_dir  = 'Users/Tobias/Dropbox/textfolder/Text Reports U.S.'
        dates_dir = 'Users/Tobias/Dropbox/textfolder/Dates Reports U.S'
        get_edgar_filing_text(comp_tuples, f_type, n_docs, file_dir, dates_dir)
    ---------------------------------------------------------------------------
    '''
    print('Fetching data...')
    print('-' * 80 + '\n')
    for idx, comp_tuple in enumerate(comp_tuples):
        comp = edgar.Company(comp_tuple[0], comp_tuple[1])
        tree = comp.get_all_filings(filing_type=f_type)
        docs = Company.get_documents(tree, no_of_documents=n_docs)
        # Now that we have the filings, find the filing dates for each
        # document. If we have them already, then great, let's load them. If
        # not, call get_edgar_filing_date to get them for this company.
        dates_path = os.path.join(dates_dir, f_type, comp_tuple[0])
        if not os.path.exists(dates_path):
            print('\nCannot find the dates for ' + comp_tuple[0] +
                  '. Attempting to download them...')
            get_edgar_filing_date([comp_tuple], f_type, dates_dir)
        else:
            os.chdir(dates_path)
        if '.' in comp_tuple[0][-1]:
            comp_tuple[0] = comp_tuple[0][:-1]
        with open(comp_tuple[0] + '.pickle', 'rb') as file:
            dates = pickle.load(file)
        dates = dates[:n_docs]
        out_dir = os.path.join(file_dir, f_type, comp_tuple[0])
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        os.chdir(out_dir)
        for date, doc in zip(dates, docs):
            with open(date.replace('.pickle', '') + '.txt', 'w',
                      encoding='utf8') as f:
                f.write(str(doc))
        mes = ('Status: ' + str(int((idx + 1) / len(comp_tuples) * 100)) +
               '% done')
        sys.stdout.write('\r' + mes)