def read(self):
    """Accumulate phrase-frequency totals from recent 8-K filings.

    For each hard-coded (company name, CIK) pair the five most recent 8-K
    documents are fetched through the ``edgar`` package and scored with
    ``phraseFreq``.  The two running totals are printed and stored in
    ``self.companyMap`` under the key returned by ``self.getName``.
    """
    watch_list = [('APPLE INC.', '0000320193'), ('GOOGLE INC.', '0001288776')]
    for entry in watch_list:
        running = [0, 0]
        name_tokens = entry[0].split()  # NOTE(review): computed but never used
        key = self.getName(entry[0])
        filings_tree = edgar.Company(entry[0], entry[1]).getAllFilings(
            filingType='8-K')
        for filing_text in edgar.getDocuments(filings_tree, noOfDocuments=5):
            totals = phraseFreq(filing_text)
            running[0] += totals[0]
            running[1] += totals[1]
        print('SEC: ' + str(entry) + ' Totals: ' + str(running))

        # NOTE(review): the decision is derived but not stored or reported.
        if running[0] + running[1] >= 6:
            decision = 'Buy'
        elif running[0] + running[1] <= -6:
            decision = 'Sell'
        else:
            decision = 'Hold'

        self.companyMap[key] = running
def extract_docs(comp_name, cik):
    """Return the text of the most recent usable 10-K filing of a company.

    Up to three 10-K documents are fetched.  Some fetched pages contain only
    a "This application relies heavily on JavaScript" notice instead of the
    filing; the first document without that notice is returned.

    Args:
        comp_name: Company name as listed on EDGAR.
        cik: CIK identifier of the company (string).

    Returns:
        The text of the first fetched document that is not the JavaScript
        notice.  If every fetched document carries the notice, the second
        one (or the only one) is returned, as before.

    Raises:
        IndexError: if no document could be fetched at all.
    """
    js_notice = "This application relies heavily on JavaScript, you"
    company = edgar.Company(comp_name, cik)
    tree = company.getAllFilings(filingType="10-K")
    docs = edgar.getDocuments(tree, noOfDocuments=3)

    # NOTE(review): the edgar package appears to return a bare string rather
    # than a list when only one document is found -- confirm.  Without this
    # guard docs[0] would be a single character.
    if isinstance(docs, str):
        docs = [docs]
    if not docs:
        raise IndexError(
            "no 10-K documents found for %s (CIK %s)" % (comp_name, cik))

    # All three fetched documents are considered, not just the first two.
    for doc in docs:
        if js_notice not in doc:
            return doc
    return docs[1] if len(docs) > 1 else docs[0]
def getUSFinanceData(cmd=None):
    """Download the latest 10-K XML document of selected US stocks to disk.

    Sets up Django, queries ``detective_app.models.USStocks`` and, for every
    enabled report type, stores one XML document per stock via ``saveFile``
    in a dated folder below a fixed reports directory.

    Args:
        cmd: optional report-type key; when truthy, only the report type
            whose key equals ``cmd`` is processed.
    """
    import sys
    import os
    import django
    import edgar

    # presumably getConfig() populates django_path / main_path -- verify
    getConfig()
    sys.path.append(django_path)
    sys.path.append(main_path)
    os.environ.setdefault("DJANGO_SETTINGS_MODULE", "MainBoard.settings")
    django.setup()
    # Must be imported after django.setup().
    import detective_app.models as detective_db

    today = str(datetime.now())[:10]

    # 101: snapshot, 103: financeReport, 104: financeRatio.
    # Only the finance report (103) is currently enabled.
    report_types = {103: 'financeReportUS'}

    # Currently restricted to a single listed security.
    stocks = detective_db.USStocks.objects.filter(
        security='Oracle Corp.', listing='Y')

    for report_id, report_name in report_types.items():
        if cmd and report_id != cmd:
            continue

        target_dir = r'C:\Github\Waver\detective\reports\%s\%s' % (
            report_name, today)
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)

        for stock in stocks:
            filings_tree = edgar.Company(
                stock.security, stock.cik).getAllFilings(filingType="10-K")
            xml_docs = edgar.getXMLDocuments(filings_tree, noOfDocuments=1)
            for xml in xml_docs:
                saveFile(target_dir, stock.cik,
                         stock.security.replace(' ', '_'), report_name,
                         xml, 'w')
def get10KByNameAndCIK(companyName, CIKNumber, noOfDocuments=1):
    """
    Fetch the latest 10-K filings of a company and pass them through
    clear_file.

    @companyName: the name of the company
    @CIKNumber: the CIK number of the company; the caller has to supply it
    @noOfDocuments: how many filings (one per year) to fetch
    @return: a list with one clear_file result per filing when the fetch
             yields a list, otherwise the clear_file result of the single
             fetched document
    """
    filings_tree = edgar.Company(companyName, CIKNumber).getAllFilings(
        filingType="10-K")
    fetched = edgar.getDocuments(filings_tree, noOfDocuments=noOfDocuments)
    if not isinstance(fetched, list):
        return clear_file(fetched)
    cleaned = []
    for raw in fetched:
        cleaned.append(clear_file(raw))
    return cleaned
# Notebook-style script: score one filing per row of `df` with TextBlob.
# NOTE(review): `companies` and `dates` are defined outside this snippet;
# presumably they are columns of `df` like `ciks` and `docs` -- confirm.
companies
ciks = df['CIK']
ciks
docs = df['DOC']
docs

itlen = len(df.index)

# Rows are collected in plain lists and the frames are built once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call.
scored_rows = []
unscored_rows = []

# zip() walks the columns positionally, so this also works when df does not
# have a default 0..n-1 index.
for company_name, cik, doc_type, date in zip(companies, ciks, docs, dates):
    company = edgar.Company(str(company_name), str(cik))
    tree = company.getAllFilings(filingType=str(doc_type), priorTo=str(date))
    filings = edgar.getDocuments(tree, noOfDocuments=1)
    filingstr = str(filings)
    article = TextBlob(filingstr)
    articlepolarity = article.sentiment.polarity
    articlesubjectivity = article.sentiment.subjectivity

    if articlepolarity != 0 and articlesubjectivity != 0:
        scored_rows.append({'DATE': date, 'CIK': cik, 'DOC': doc_type,
                            'POLARITY': articlepolarity,
                            'SUBJECTIVITY': articlesubjectivity})
    # NOTE(review): rows where exactly one of the two scores is zero land in
    # neither frame (unchanged from the original) -- confirm this is intended.
    if articlepolarity == 0 and articlesubjectivity == 0:
        unscored_rows.append({'DATE': date, 'CIK': cik, 'DOC': doc_type})

dfout = pd.DataFrame(scored_rows,
                     columns=['DATE', 'CIK', 'DOC', 'POLARITY', 'SUBJECTIVITY'])
dfnull = pd.DataFrame(unscored_rows, columns=['DATE', 'CIK', 'DOC'])
dfout
dfnull
import edgar
"""https://pypi.org/project/edgar/

This URL contains the documentation for the edgar package used to retrieve
SEC filings.

**The code block below consists of three variable assignments which retrieve
the specified data.**

The first line uses *Company*, which requires two parameters: company name
and CIK. The variable *company* is set to the company of interest using these
two fields.

The second line uses *getAllFilings*, which returns an *lxml.html* form.
There are four possible parameters (filingType, priorTo, ownership,
noOfEntries); only the first two are needed here. *filingType* selects the
type of document to retrieve (10-K, S-8, 8-K). *priorTo* must be a string in
the form *YYYY-MM-DD*.

The third line uses *getDocuments*, which returns the body text of the
requested documents. Its first parameter is the *lxml.html* form returned by
*getAllFilings*; the second, *noOfDocuments*, is the number of documents to
return. In this IDE it should always be set to 1 to avoid crashing.
"""
company = edgar.Company("AMAZON COM INC", "0001018724")
tree = company.getAllFilings(filingType = "10-K", priorTo = '2014-12-31')
docs = edgar.getDocuments(tree, noOfDocuments=1)

# TextBlob needs a string.  NOTE(review): depending on the edgar version,
# getDocuments may hand back a list or a bare string -- str() covers both.
docstr = str(docs)
print(tree)
print(docstr)
type(docs)

# Score the string form (previously `docstr` was computed but the raw `docs`
# object was passed to TextBlob).
article = TextBlob(docstr)
articlepolarity = article.sentiment.polarity
articlesubjectivity = article.sentiment.subjectivity
print(articlepolarity)
def get_edgar_filing_text(comp_tuples,f_type,n_docs,file_dir,dates_dir):
    '''
    ---------------------------------------------------------------------------
    Scraping function to get the text from company filings from EDGAR.

    Inputs:

        - comp_tuples : A list with pairwise company tuples. The first element
                        must be a string with the company name as listed on
                        the EDGAR database. The second element must be a
                        string with the CIK identifier. See
                        get_sp500_tickers_cik_industry(argin) to easily get
                        the tuples from the S&P500 listed firms.

        - f_type      : A string with the filing type.

        - n_docs      : Number of filings to be fetched, in descending order,
                        i.e. n_docs = 3 will fetch the three newest filings of
                        type f_type. As an integer.

        - file_dir    : The master directory where all filings are to be
                        saved. As a string.

        - dates_dir   : The master directory where all filing dates are saved.
                        If a directory is missing, the function will instead
                        scrape the dates using get_edgar_filing_date(argin),
                        and create a new folder with the dates.

    Unlike earlier versions, the current working directory is left untouched;
    all files are addressed by their full path.

    Example:

    comp_tuples = [['APPLE INC'     , '0000320193'],
                   ['MCDONALDS CORP', '0000063908'],
                   ['MICROSOFT CORP', '0000789019']]

    f_type = '10-K' [Or '10-Q']
    n_docs = 3
    file_dir = 'C:\\Users\\Tobias\\Dropbox\\Master\\Text Reports U.S.'
    dates_dir = 'C:\\Users\\Tobias\\Dropbox\\Master\\Dates Reports U.S'

    get_edgar_filing_text(comp_tuples,f_type,n_docs,file_dir,dates_dir)
    ---------------------------------------------------------------------------
    '''
    print('Fetching data...')
    print('-'*80+ '\n')

    for idx, comp_tuple in enumerate(comp_tuples):
        comp_name = comp_tuple[0]
        comp = edgar.Company(comp_name, comp_tuple[1])
        tree = comp.getAllFilings(filingType = f_type)
        docs = edgar.getDocuments(tree, noOfDocuments=n_docs)
        # NOTE(review): the edgar package appears to return a bare string when
        # only one document is found -- confirm.  Wrapping it keeps zip()
        # below from pairing dates with single characters.
        if isinstance(docs, str):
            docs = [docs]

        # Find the filing dates for each document.  If they are not on disk
        # yet, let get_edgar_filing_date fetch them for this company first.
        # Backslash separators are kept so the paths stay identical to the
        # ones the rest of the project builds.
        comp_dates_dir = dates_dir + '\\' + f_type + '\\' + comp_name
        if not os.path.exists(comp_dates_dir):
            print(('\nCannot find the dates for ' + comp_name +
                   '. Attempts to download them...'))
            get_edgar_filing_date([comp_tuple],f_type,dates_dir)

        # Always load the dates, including right after downloading them;
        # previously `dates` stayed unbound (or stale from the preceding
        # company) on the download path.
        # presumably get_edgar_filing_date writes <name>.pickle into
        # comp_dates_dir -- verify against that function.
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only point dates_dir at trusted data.
        with open(comp_dates_dir + '\\' + comp_name + '.pickle', 'rb') as file:
            dates = pickle.load(file)
        dates = dates[:n_docs]

        comp_file_dir = file_dir + '\\' + f_type + '\\' + comp_name
        # makedirs also creates a missing <file_dir>\<f_type> parent folder.
        os.makedirs(comp_file_dir, exist_ok=True)

        for date, doc in zip(dates,docs):
            out_name = date.replace('.pickle','') + '.txt'
            with open(comp_file_dir + '\\' + out_name, 'w',
                      encoding='utf8') as f:
                f.write(str(doc))

        mes = ('Status: ' + str(int((idx+1)/len(comp_tuples)*100)) + '% done')
        sys.stdout.write('\r'+mes)
def countStatesApperance(doc):
    """Return how many distinct whitespace-separated words of *doc* occur in
    the module-level ``states`` collection."""
    return len({word for word in doc.split() if word in states})


# NOTE(review): `ed` is only referenced by the disabled name lookup below.
ed = edgar.Edgar()
c = "785814"
c = c.zfill(10)  # CIKs are passed as 10-digit, zero-padded strings
#n = ed.getCompanyNameByCik(c)
company = edgar.Company("INTEGRATED HEALTH SVCS INC", c)
tree = company.getAllFilings(filingType="10-K")
docs = edgar.getDocuments(tree, noOfDocuments=30)

# Only touch docs[0] once we know a document came back; previously the file
# write ran before this guard and raised IndexError on an empty result.
if len(docs) > 0:
    with io.open("C:/Users/William/Desktop/Output.txt", "w",
                 encoding="utf-8") as f:
        f.write(docs[0])
    print(countStatesApperance(docs[0]))
    print(extractYear(docs[0]))

# TODO: for each 10-K, determine which fiscal year it is.
import edgar

# Keep the Edgar instance under its own name.  Rebinding `edgar` to the
# instance would shadow the module, and the module-level calls below
# (edgar.Company, edgar.getDocuments) would then be looked up on the instance.
edgar_client = edgar.Edgar()
possible_companies = edgar_client.findCompanyName("Cisco System")
print(possible_companies)

# Get Oracle Corp's last five 10-K filings.
company = edgar.Company("Oracle Corp", "0001341439")
tree = company.getAllFilings(filingType="10-K")
docs = edgar.getDocuments(tree, noOfDocuments=5)
# docs is an array of strings; each one is the full text of a document.

# SIC codes
url = "https://www.sec.gov/info/edgar/siccodes.htm"

# Developer page: https://www.sec.gov/developer
# Change tickers into CIKs.
cik_dict = {}
cik = None
for ticker in DEFAULT_TICKERS:
    response = requests.get(URL.format(ticker), stream=True)
    results = CIK_RE.findall(response.text)
    if results:
        cik = str(results[0])
        cik_dict[str(ticker).upper()] = cik
print(cik_dict)

# Everything below works on the last ticker that resolved to a CIK.  Using
# `cik` throughout (instead of results[0] of whichever ticker happened to be
# last) keeps the company name and the URL consistent and avoids an
# IndexError when the final lookup returned nothing.
if cik is None:
    raise RuntimeError("none of the tickers could be resolved to a CIK")

# Use edgar to get the company name that belongs to the CIK.
edgar1 = edgar.Edgar()
cmp_name = edgar1.getCompanyNameByCik(cik)
print(cmp_name)
company = edgar.Company(cmp_name, cik)

# Create the file name and URL structure.
file_name = [
    entry for entry in os.listdir(out_path)
    if os.path.isfile(os.path.join(out_path, entry))
]
if not file_name:
    raise FileNotFoundError("no files found in " + str(out_path))
# presumably "<accession-number>.txt" becomes "<accession digits>/index.json"
# -- verify against the files actually stored in out_path
switched_filename = file_name[0]
switched_filename = switched_filename.replace('-', '').replace(
    '.txt', '/index.json')
print(switched_filename)
print(file_name)

bare_url = r"https://www.sec.gov/Archives/edgar/data/"
base_url = r"https://www.sec.gov"
documents_url = bare_url + cik + "/" + switched_filename
def _fiscal_year_pattern(year):
    """Build the regex that recognises the cover page of fiscal year *year*."""
    return (
        r'FOR THE FISCAL YEAR ENDED\s*[0-9]*\s*[A-Z]*\s*' + str(year)
        + r'|FOR THE\s*(FISCAL)? YEAR ENDED\s*(Commission File Number)?\W?\s*[A-Z]*\s*[0-9]*,\s*'
        + str(year)
    )


def annual_filings(name, ID, year, doc_num):
    '''This function finds the correct document.

    Starting at index ``doc_num`` of the company's 14 most recent 10-K
    documents, the index is moved until the document for fiscal year ``year``
    is found.  That document is split on 'ITEM ' and the positions of the
    "Item 7 - Management's Discussion and Analysis" and "Item 9 - Controls and
    Procedures" headings are collected.

    Returns:
        (name, ID, start, stop, d) where ``start`` / ``stop`` are lists of
        item indices and ``d`` is the index of the document used; or
        (name, None) when a document matches none of year-1, year, year+1 or
        when the search cannot make progress.  If the search runs past the
        last fetched document, (name, ID, [], [], d) is returned.
    '''
    # get filings from package
    company = edgar.Company(name, ID)
    tree = company.getAllFilings(filingType="10-K")
    doc = edgar.getDocuments(tree, noOfDocuments=14)
    # NOTE(review): the edgar package appears to return a bare string when
    # only one document is found -- confirm.
    if isinstance(doc, str):
        doc = [doc]
    year = int(year)

    # search for the right document year
    d = doc_num
    visited = set()
    sections = None
    while d < len(doc):
        # A negative index would silently wrap around to the oldest filing,
        # and revisiting an index means the +1 / -1 steps are bouncing
        # between two documents forever: give up in both cases.
        if d < 0 or d in visited:
            return name, None
        visited.add(d)

        filing = re.sub(r'\xa0|\n', ' ', doc[d])
        # see if the document is amended
        if '10-K/A' in filing[1:15]:
            d += 1
        # if in the right fiscal year, remove the new lines and stop
        elif re.search(_fiscal_year_pattern(year), filing, re.IGNORECASE):
            sections = filing.replace('\n', '').replace('\t', '').replace(
                '\r', '').replace('Contents', ' ').upper().split('ITEM ')
            break
        # document is one year too new: look at the next (older) one
        elif re.search(_fiscal_year_pattern(year + 1), filing, re.IGNORECASE):
            d += 1
        # document is one year too old: step back towards newer ones
        elif re.search(_fiscal_year_pattern(year - 1), filing, re.IGNORECASE):
            d -= 1
        # (a year-2 step-back for Google filings was disabled in the original)
        else:
            return name, None

    if sections is None:
        # Ran past the last document without finding the year; there is
        # nothing to index.
        return name, ID, [], [], d

    # remove characters from filing
    new_doc = [re.sub(r'\xa0*|(?<=[7-8])\W?s?', '', item) for item in sections]

    start = []
    stop = []
    for i, item in enumerate(new_doc):
        # Both heading variants are tested independently, so an item matching
        # both is recorded twice.
        if re.search(
                r'7\s*\W?\.?\s*(AND 7A.)?(COMBINED)?\s*MANAGEMENT\W?\s*S\s*DISCUSSION\s*AND\s*ANALYSIS\s*OF\s*(CONSOLIDATED)?\s*FINANCIAL\s*CONDITION\S?\s*AND\s*RESULTS\s*OF\s*OPERATION\S?',
                item):
            start.append(i)
        if re.search(
                r'7\s*\W?\.?\s*MANAGEMENT\W?\s*S\s*DISCUSSION\s*AND\s*ANALYSIS\s*OF\s*RESULTS\s*OF\s*OPERATIONS\s*AND\s*FINANCIAL\s*CONDITION',
                item):
            start.append(i)
        if re.search(r'9\s*\W?\w?\.?\s*\W?\s*CONTROLS\s*AND\s*PROCEDURES',
                     item):
            stop.append(i)
    return name, ID, start, stop, d
# Company-specific markers used when the regular Item 7 / Item 9 indices do
# not delimit a plausible MD&A section.
_MDA_JPMORGAN = r'CONTENTS\s*FINANCIAL:\s+'
_MDA_CHEVRON = r'FINANCIAL TABLE OF\s*|BLANK\)\s*INDEX TO MANAGEMENT\W?S DISCUSSION AND ANALYSIS'
_MDA_EXXON = r'FINANCIAL SECTION\s+TABLE OF\s+CONTENTS\s+BUSINESS PROFILE'


def _mda_clean_items(items):
    """Strip non-breaking spaces and the noise following a 7/8 from each item."""
    return [re.sub(r'\xa0*|(?<=[7-8])\W?s?', '', item) for item in items]


def _mda_tail_from_first_match(items, patterns):
    """Return items from the first one matching any pattern on, else None."""
    for i, item in enumerate(items):
        if any(re.search(pattern, item) for pattern in patterns):
            return items[i:]
    return None


def MDA(name, ID, start_index, stop_index, doc_num, year):
    '''This function returns the MDA text of the 10-k filing using the indices
    provided and the correct document.

    Args:
        name, ID: company name and CIK handed to the edgar package.
        start_index, stop_index: lists of item indices locating the MD&A and
            the "Controls and Procedures" headings in the document split on
            'ITEM '.
        doc_num: index of the document among the most recent 10-K filings.
        year: fiscal year; only used for General Electric filings after 2013,
            which are split on their 'GE <year> FORM 10-K' page marker.

    Returns:
        (name, MDA_text) where MDA_text is a list of document items, or None
        when no section could be located.
    '''
    # get filings from package
    company = edgar.Company(name, ID)
    tree = company.getAllFilings(filingType="10-K")
    doc = edgar.getDocuments(tree, noOfDocuments=doc_num + 1)
    # NOTE(review): the edgar package appears to return a bare string when
    # only one document is found (doc_num == 0) -- confirm.  Without this
    # guard doc[doc_num] would be a single character.
    if isinstance(doc, str):
        doc = [doc]

    text = re.sub(r'\xa0|\n', ' ', doc[doc_num])
    text = text.replace('\n', '').replace('\t', '').replace('\r', '').replace(
        'Contents', ' ')

    # int() so that a year given as a string compares correctly.
    if name == 'General Electric' and int(year) > 2013:
        new_doc = _mda_clean_items(text.split('GE ' + str(year) + ' FORM 10-K'))
        # Caller-supplied integer indices remain the fallback; index lists
        # cannot be used for slicing and are ignored here.
        start = start_index if isinstance(start_index, int) else None
        stop = stop_index if isinstance(stop_index, int) else None
        for i, item in enumerate(new_doc):
            if re.search(
                    r'MANAGEMENT\W?\s*S\s*DISCUSSION\s*AND\s*ANALYSIS\s*OF\s*FINANCIAL\s*CONDITION\s*AND\s*RESULTS\s*OF\s*OPERATIONS\s*\(MD',
                    item):
                start = i
            if re.search(
                    r'MANAGEMENT\W?\s*S\s*ANNUAL\s*REPORT\s*ON\s*INTERNAL\s*CONTROL\s*OVER\s*FINANCIAL\s*REPORTING',
                    item):
                stop = i
        if start is None or stop is None:
            # One of the markers is missing: no section instead of a TypeError.
            return name, None
        return name, new_doc[start:stop]

    # create the same format used by the indexing step
    new_doc = _mda_clean_items(text.upper().split('ITEM '))

    # find the text using the indices
    if len(start_index) == 2 and len(stop_index) == 2:
        if stop_index[-1] - start_index[-1] > 7:
            return name, new_doc[start_index[1]:stop_index[1]]
        # section too short to be real: JP Morgan, Chevron, Exxon Mobil
        return name, _mda_tail_from_first_match(
            new_doc, (_MDA_JPMORGAN, _MDA_CHEVRON, _MDA_EXXON))

    if not stop_index:
        # No end marker was found; previously this raised IndexError in the
        # branches below.
        return name, None

    if len(start_index) > 5:
        return name, new_doc[start_index[-4]:stop_index[-1]]

    if len(start_index) > 2:
        if stop_index[-1] - start_index[-1] > 4:
            return name, new_doc[start_index[-1]:stop_index[-1]]
        if stop_index[-1] > start_index[-1]:
            return name, new_doc[start_index[-2]:stop_index[-1]]
        # Exxon Mobil, Chevron 2013 & 2012
        return name, _mda_tail_from_first_match(
            new_doc, (_MDA_EXXON, _MDA_CHEVRON))

    if len(start_index) == 1 and stop_index[0] - start_index[0] > 8:
        if name != 'Wells Fargo':
            return name, new_doc[start_index[0]:stop_index[0]]
        return name, None

    return name, None