def download(tickers):
    path = get_filings_folder()
    dl = Downloader(path)
    n = len(tickers)
    for i in range(n):
        print_progress(i, n)
        # skip tickers whose filings have already been downloaded
        if not os.path.exists('../Filings/sec_edgar_filings/' + tickers[i]):
            dl.get_10k_filings(tickers[i])
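# Note: get_filings_folder() and print_progress() are helpers that are not shown here.
# A minimal sketch of what they might look like (hypothetical implementations, not the originals):
import os

def get_filings_folder():
    # assumed to return the folder the Downloader should write into
    return os.path.abspath('../Filings')

def print_progress(i, n):
    # assumed to print a simple "i of n" progress indicator
    print('Downloading {} of {}'.format(i + 1, n))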
def form_valid(self, form):
    # get the user and validate the form
    form.instance.user = self.request.user
    super(Createlisting, self).form_valid(form)

    # get the company CIK
    tik = form.data['ticker']
    URL = 'http://www.sec.gov/cgi-bin/browse-edgar?CIK={}&Find=Search&owner=exclude&action=getcompany'
    CIK_RE = re.compile(r'.*CIK=(\d{10}).*')
    f = requests.get(URL.format(tik), stream=True)
    results = CIK_RE.findall(f.text)
    if len(results):
        cik = results[0]
        cmp_name = self.edgar.getCompanyNameByCik(cik)

    # create the object in the database
    company = Company(ticker=tik, cik=cik, name=cmp_name, user=self.request.user)
    company.save()

    # delete companies saved with an empty name
    Company.objects.filter(name='').delete()

    # build the filename and URL structure
    BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    path = os.path.join(BASE_DIR, 'static')
    out_path = path + "/sec_edgar_filings/" + tik + "/10-K/"

    # create the downloader with the download path; the full .txt submission is too big to load
    # without iterating the XML, so an alternative approach is worth exploring
    dl = Downloader(path)
    # download only the latest filing by passing 1
    dl.get_10k_filings(tik, 1)

    # remove the dashes and .txt to turn the filename into the accession number of the current filing
    file_name = [
        f for f in os.listdir(out_path)
        if os.path.isfile(os.path.join(out_path, f))
    ]
    switched_filename = file_name[0]
    switched_filename = switched_filename.replace('-', '').replace('.txt', '/index.json')

    # base URL configuration; this could be built more cleanly
    bare_url = r"https://www.sec.gov/Archives/edgar/data/"
    base_url = r"https://www.sec.gov"
    documents_url = bare_url + str(results[0]) + "/" + switched_filename

    # retrieve the file index and find the filing summary
    content = requests.get(documents_url).json()
    for file in content['directory']['item']:
        # grab the filing summary and build a new URL leading to the file so we can download it
        if file['name'] == 'FilingSummary.xml':
            xml_summary = base_url + content['directory']['name'] + "/" + file['name']
            # print info
            print('-' * 50)
            print('File Name: ' + file['name'])
            print('File Path: ' + xml_summary)

    # define a new base URL that represents the filing folder; this comes in handy when downloading the reports
    base_url = xml_summary.replace('FilingSummary.xml', '')

    # request and parse the content
    content = requests.get(xml_summary).content
    soup = BeautifulSoup(content, 'lxml')

    # find the 'myreports' tag because it contains all the individual reports submitted
    reports = soup.find('myreports')

    # master list to store all the individual components of the report
    master_reports = []

    # loop through each report in the 'myreports' tag, skipping the last one because it produces an error
    for report in reports.find_all('report')[:-1]:
        # dictionary to store the different parts we need
        report_dict = {}
        report_dict['name_short'] = report.shortname.text
        report_dict['name_long'] = report.longname.text
        report_dict['position'] = report.position.text
        report_dict['menu_category'] = report.menucategory.text
        report_dict['url'] = base_url + report.htmlfilename.text

        # append the dictionary to the master list
        master_reports.append(report_dict)

        if report_dict['name_short'] == 'Consolidated Statements of Cash Flows':
            # print the info
            print('-' * 50)
            print(base_url + report.htmlfilename.text)
            print(report.longname.text)
            print(report.shortname.text)
            print(report.menucategory.text)
            print(report.position.text)
            # keep the URL here because a bug was writing a different file path into the database
            redirect_url_to_statement = base_url + report.htmlfilename.text

    # in case of multiple statements
    statements_url = []
    for report_dict in master_reports:
        # define the statements we want to look for
        item1 = r"Consolidated Statements of Cash Flows"
        # store them in a list
        report_list = [item1]

        # if the short name can be found in the report list
        if report_dict['name_short'] in report_list:
            # print some info and store the URL
            print('-' * 50)
            print(report_dict['name_short'])
            print(report_dict['url'])
            statements_url.append(report_dict['url'])

    statement = Statement(year=2019,
                          type="CONSOLIDATED STATEMENTS OF CASH FLOWS",
                          url=redirect_url_to_statement,
                          company=company)
    statement.save()

    statements_data = []
    # loop through each statement URL
    for statementUrl in statements_url:
        # dictionary that will store the different parts of the statement
        statement_data = {}
        statement_data['headers'] = []
        statement_data['sections'] = []
        statement_data['data'] = []

        # request the statement file content
        content = requests.get(statementUrl).content
        report_soup = BeautifulSoup(content, 'html')

        # find all the rows, figure out what type of row each one is, parse the elements, and store them
        for index, row in enumerate(report_soup.table.find_all('tr')):
            # first get all the elements
            cols = row.find_all('td')

            # if it's a regular row and not a section or a table header
            if (len(row.find_all('th')) == 0 and len(row.find_all('strong')) == 0):
                reg_row = [ele.text.strip() for ele in cols]
                statement_data['data'].append(reg_row)
            # if it's a section row but not a table header
            elif (len(row.find_all('th')) == 0 and len(row.find_all('strong')) != 0):
                sec_row = cols[0].text.strip()
                statement_data['sections'].append(sec_row)
            # finally, if it's neither of those it must be a header
            elif (len(row.find_all('th')) != 0):
                hed_row = [ele.text.strip() for ele in row.find_all('th')]
                statement_data['headers'].append(hed_row)
            else:
                print('We encountered an error.')

        # write the parsed data into the database:
        # create each header and rotate through all of its data values
        print("Saving Headers...")
        for i in range(len(statement_data['headers'][1])):
            print(statement_data['headers'][1][i])
            statementHeader = Statment_element_headers(
                field=statement_data['headers'][1][i], statement=statement)
            statementHeader.save()

            print("Saving Data Elements...")
            for j in statement_data['data']:
                print(j)
                print(j[i + 1])
                k = j[i + 1]
                # normalize the data format, e.g. "$(1,234)" -> -1234.0
                if any(ch in k for ch in ('$', ',', '(')):
                    k = k.replace('$', '')
                    k = k.replace(' ', '')
                    k = k.replace(',', '')   # commas are thousands separators
                    k = k.replace('(', '-')  # parentheses mark negative values
                    k = k.replace(')', '')
                    k = float(k)
                print(k)
                statementData = Statement_element_data(key=j[0],
                                                       value=k,
                                                       statement=statement,
                                                       company=company,
                                                       header=statementHeader)
                statementData.save()
                print(j)
            print("Saving Data Done for Element")
        print("Saving Headers Done")

        print("Saving Sections...")
        for i in statement_data['sections']:
            print(i)
            statementSections = Statement_element_section(fieldName=i,
                                                          statement=statement)
            statementSections.save()
        print("Saving Sections Done...")

        # append it to the master list for future analysis with pandas and NLP
        statements_data.append(statement_data)

    # print(statements_data)
    return redirect('home')
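# The view above references Django models (Company, Statement, Statment_element_headers,
# Statement_element_data, Statement_element_section) whose definitions are not shown.
# A minimal sketch of what a matching models.py might contain, inferred only from how the view
# uses them; the field types and lengths are assumptions:
from django.conf import settings
from django.db import models

class Company(models.Model):
    ticker = models.CharField(max_length=10)
    cik = models.CharField(max_length=10)
    name = models.CharField(max_length=255)
    user = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE)

class Statement(models.Model):
    year = models.IntegerField()
    type = models.CharField(max_length=100)
    url = models.URLField()
    company = models.ForeignKey(Company, on_delete=models.CASCADE)

class Statment_element_headers(models.Model):  # spelling kept to match the view above
    field = models.CharField(max_length=255)
    statement = models.ForeignKey(Statement, on_delete=models.CASCADE)

class Statement_element_data(models.Model):
    key = models.CharField(max_length=255)
    value = models.CharField(max_length=255)  # values may be numbers or text labels
    statement = models.ForeignKey(Statement, on_delete=models.CASCADE)
    company = models.ForeignKey(Company, on_delete=models.CASCADE)
    header = models.ForeignKey(Statment_element_headers, on_delete=models.CASCADE)

class Statement_element_section(models.Model):
    fieldName = models.CharField(max_length=255)
    statement = models.ForeignKey(Statement, on_delete=models.CASCADE)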
], axis=0)
n_cik = nyse_nas.shape[0]
error_list = []
print('Number of Stocks: ' + str(n_cik))
n = 0

# remove already-downloaded CIKs from the list
def diff(first, second):
    n = len(os.listdir('D:\\Thesis_data\\sec_edgar_filings'))
    second = set(second)
    return [item for item in first if str(item) not in second], n

download_list, n = diff(list(nyse_nas['CIK']),
                        os.listdir('D:\\Thesis_data\\sec_edgar_filings'))

for cik in download_list:
    if n % 10 == 0:
        print('No.' + str(n) + ' is processing...' + str(round(n / n_cik * 100, 2)) + '%')
    try:
        response = dl.get_10k_filings(cik, 25)
    except Exception:
        error_list.append(cik)
        continue
    if not response:
        # create an empty folder so this CIK is skipped on the next run
        os.mkdir('D:\\Thesis_data\\sec_edgar_filings\\' + str(cik))
    time.sleep(5)
    n = n + 1
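# The snippet above begins mid-expression ("], axis=0)"), so the construction of `nyse_nas` and
# `dl` is not shown. A plausible setup, purely as an assumption (the file names are placeholders),
# would concatenate NYSE and NASDAQ listings that each carry a CIK column and create one downloader:
import pandas as pd
from sec_edgar_downloader import Downloader

nyse = pd.read_csv('nyse_companies.csv')      # hypothetical listing file with a 'CIK' column
nasdaq = pd.read_csv('nasdaq_companies.csv')  # hypothetical listing file with a 'CIK' column
nyse_nas = pd.concat([nyse, nasdaq], axis=0)
dl = Downloader('D:\\Thesis_data')            # filings land under D:\Thesis_data\sec_edgar_filings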
# This script downloads 10-K and 10-Q reports for the specified tickers from EDGAR using the
# sec_edgar_downloader package and renames them so they resolve correctly as HTML files.
import os

from sec_edgar_downloader import Downloader

basepath = 'C:\\Users\\Dell\\OneDrive - George Mason University\\MBA 797\\Stock Data\\'
SMIF_tickers = [
    "GOOG", "AMZN", "BBT", "BA", "BMY", "CBRE", "CSCO", "C", "STZ", "CVA",
    "D", "XLE", "ESS", "FTNT", "GS", "HCP", "XLV", "HON", "JPM", "KSU",
    "LEN", "MSFT", "NEE", "PYPL", "PFE", "PNC", "RTN", "SYF", "TJX", "UNH",
    "VZ", "WMT", "DIS", "WDC"
]

# download the five most recent 10-Ks and four most recent 10-Qs for each ticker
for i in range(0, len(SMIF_tickers)):
    dl = Downloader(basepath + SMIF_tickers[i])
    dl.get_10k_filings(SMIF_tickers[i], 5)
    dl.get_10q_filings(SMIF_tickers[i], 4)

# rename the downloaded .txt filings to .html
for z in range(0, len(SMIF_tickers)):
    filelistK = os.listdir(basepath + SMIF_tickers[z] + "\\sec_edgar_filings\\" +
                           SMIF_tickers[z] + "\\10-K\\")
    filelistQ = os.listdir(basepath + SMIF_tickers[z] + "\\sec_edgar_filings\\" +
                           SMIF_tickers[z] + "\\10-Q\\")
    for i in range(0, len(filelistK)):
        os.rename(
            basepath + SMIF_tickers[z] + "\\sec_edgar_filings\\" + SMIF_tickers[z] +
            "\\10-K\\" + filelistK[i],
            basepath + SMIF_tickers[z] + "\\sec_edgar_filings\\" + SMIF_tickers[z] +
            "\\10-K\\" + filelistK[i][:-3] + "html")
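# The header comment says both 10-K and 10-Q reports are renamed, but the loop above only touches
# the 10-K folders. A small helper that would cover either folder is sketched below (an assumption,
# not part of the original script); it uses os.path.splitext instead of slicing the last three characters:
def rename_to_html(folder):
    # rename every .txt filing in `folder` to .html so it opens in a browser
    for filing in os.listdir(folder):
        root, ext = os.path.splitext(filing)
        if ext == ".txt":
            os.rename(os.path.join(folder, filing), os.path.join(folder, root + ".html"))

# e.g. rename_to_html(basepath + SMIF_tickers[z] + "\\sec_edgar_filings\\" + SMIF_tickers[z] + "\\10-Q\\")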
def getFilings(ticker):
    # Get all 10-K and 10-Q filings for a ticker
    dl = Downloader(os.getcwd())
    dl.get_10k_filings(ticker)
    dl.get_10q_filings(ticker)

    # Get the directories of the newly added files
    directoryK = os.getcwd() + "/sec_edgar_filings/" + ticker + "/10-K"
    directoryQ = os.getcwd() + "/sec_edgar_filings/" + ticker + "/10-Q"

    # Create a dataframe to store the information scraped from the filings
    SECInfo = pd.DataFrame(columns=[
        "Filing Type", "Filing Year", "Filing Date", "Net Income",
        "MDA Sentiment Analysis"
    ])

    # For each new text file, go through and clean it
    for filename in os.listdir(directoryK):
        if filename.endswith(".txt"):
            # Make a new cleaned file
            year = re.search('-(.*)-', filename).group(1)
            html = open(directoryK + "/" + filename)
            f = html.read()
            name = directoryK + "-cleaned" + "/" + ticker + "-" + year + "-" + "10K.txt"
            os.makedirs(os.path.dirname(name), exist_ok=True)
            # Store the sentiment of each line as the scraper goes through the MDA
            sentiment = []
            # If there is an error, move on to the next file.
            try:
                # Convert the HTML to a readable format in the first file
                w = open(name, "w")
                w.write(html2text.html2text(f))
                html.close()
                name2 = directoryK + "-MDA" + "/" + ticker + "-" + year + "-" + "10K-MDA.txt"
                os.makedirs(os.path.dirname(name2), exist_ok=True)
                w.close()

                # Extract the MDA from the readable format into the second file
                wfile = open(name, "r")
                w = wfile.readlines()
                w2 = open(name2, "w")

                # For each line, check whether it is the start of an MDA section or the start of the next section.
                flag = False
                for line in w:
                    if flag or "discussion and analysis of" in line.lower().rstrip() \
                            or "management's discussion and analysis" in line.lower().rstrip():
                        # Make sure the line is legitimate and not all punctuation before adding
                        if len(line) > 20 and count_punct(line) < 4 and " " in line:
                            w2.write(line)
                            flag = True
                            # Conduct sentiment analysis
                            pol_score = sid.polarity_scores(line)
                            sentiment.append(pol_score["compound"])
                        if "financial statements and supplementary data" in line.lower().rstrip() \
                                or "statements and supplementary" in line.lower().rstrip():
                            flag = False
                    # Get the time of the filing
                    if "conformed period of report" in line.lower().rstrip():
                        filingDateRaw = line.lower().split("report: ", 1)[1][:8]
                        filingDate = filingDateRaw[0:4] + "-" + filingDateRaw[4:6] + "-" + filingDateRaw[-2:]
                wfile.close()
                w2.close()

                # This is a placeholder value that was never resolved
                netIncome = True
                try:
                    SECInfo = SECInfo.append(
                        {
                            "Filing Type": "10-K",
                            "Filing Year": year,
                            "Filing Date": filingDate,
                            "Net Income": netIncome,
                            "MDA Sentiment Analysis": sentiment
                        },
                        ignore_index=True)
                except UnboundLocalError:
                    continue
            except (NotImplementedError, UnicodeEncodeError) as error:
                print("not implemented error for " + year)
                continue
            continue
        else:
            continue

    # This is the same loop as above, except for 10-Q filings instead of 10-Ks. See those comments.
    for filename in os.listdir(directoryQ):
        if filename.endswith(".txt"):
            year = re.search('-(.*)-', filename).group(1)
            html2 = open(directoryQ + "/" + filename)
            f = html2.read()
            name = directoryQ + "-cleaned" + "/" + ticker + "-" + year + "-" + "10Q.txt"
            print(name)
            flag = False
            os.makedirs(os.path.dirname(name), exist_ok=True)
            w = open(name, "w")
            try:
                w.write(html2text.html2text(f))
                html2.close()
                name2 = directoryQ + "-MDA" + "/" + ticker + "-" + year + "-" + filename[14:20] + "-10Q-MDA.txt"
                os.makedirs(os.path.dirname(name2), exist_ok=True)
                w.close()

                wfile = open(name, "r")
                w = wfile.readlines()
                w2 = open(name2, "w")

                sentiment = []
                flag = False
                for line in w:
                    if flag or "s discussion and analysis of" in line.lower().rstrip() \
                            or "management's discussion and analysis" in line.lower().rstrip():
                        if len(line) > 20 and count_punct(line) < 5 and " " in line:
                            w2.write(line)
                            flag = True
                            pol_score = sid.polarity_scores(line)
                            sentiment.append(pol_score["compound"])
                        if "controls and procedures" in line.lower() \
                                or "in witness whereof" in line.lower() \
                                or "item 4." in line.lower():
                            flag = False
                    # Get the time of the filing
                    if "conformed period of report" in line.lower().rstrip():
                        filingDateRaw = line.lower().split("report: ", 1)[1][:8]
                        filingDate = filingDateRaw[0:4] + "-" + filingDateRaw[4:6] + "-" + filingDateRaw[-2:]
                wfile.close()
                w2.close()

                SECInfo = SECInfo.append(
                    {
                        "Filing Type": "10-Q",
                        "Filing Year": year,
                        "Filing Date": filingDate,
                        "Net Income": netIncome,
                        "MDA Sentiment Analysis": sentiment
                    },
                    ignore_index=True)
            except (NotImplementedError, UnicodeEncodeError) as error:
                w.close()
                print("not implemented error for " + year)
                continue
            continue
        else:
            continue

    # Convert the large DataFrame we have made to a CSV for later use.
    SECInfo.to_csv("sec_processed_filings/" + ticker + "-SEC-Information.csv")
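# getFilings() relies on two names defined elsewhere: `sid`, a VADER sentiment analyzer, and
# `count_punct`, a punctuation counter. A minimal sketch of how they might be set up; the
# count_punct implementation is an assumption, while the NLTK VADER usage matches the calls above:
import string
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

nltk.download('vader_lexicon')  # required once before the analyzer can score text
sid = SentimentIntensityAnalyzer()

def count_punct(line):
    # count punctuation characters in a line; used above to skip lines that are mostly punctuation
    return sum(1 for ch in line if ch in string.punctuation)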
import os
import re
import sys

import requests
import edgar
from sec_edgar_downloader import Downloader

# sys.setrecursionlimit(30000)

# create the downloader with the absolute download path
input_ticker = "WMT"
path = "/home/merovingie/PycharmProjects/Sec-Scrap/secscrap/edgardownloadertrial"
out_path = path + "/sec_edgar_filings/" + input_ticker + "/10-K/"
print(out_path)
dl = Downloader(path)

# download only the latest filing
dl.get_10k_filings(input_ticker, 1)

# # create soup object
# soup = BeautifulSoup(open("/home/merovingie/PycharmProjects/Sec-Scrap/secscrap/edgardownloadertrial/sec_edgar_filings/WMT/10-K/0000104169-19-000016.txt"))
# print(soup.prettify())

DEFAULT_TICKERS = [input_ticker]
URL = 'http://www.sec.gov/cgi-bin/browse-edgar?CIK={}&Find=Search&owner=exclude&action=getcompany'
CIK_RE = re.compile(r'.*CIK=(\d{10}).*')

# turn each ticker into a CIK
cik_dict = {}
for ticker in DEFAULT_TICKERS:
    f = requests.get(URL.format(ticker), stream=True)
    results = CIK_RE.findall(f.text)
    if len(results):