Example No. 1
def download(tickers):
    # Download 10-K filings for each ticker that has not been fetched yet.
    path = get_filings_folder()
    dl = Downloader(path)
    n = len(tickers)
    for i in range(n):
        print_progress(i, n)
        # Skip tickers whose filings folder already exists.
        if not os.path.exists(os.path.join('../Filings/sec_edgar_filings', tickers[i])):
            dl.get_10k_filings(tickers[i])
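Example 1 leans on two project helpers that are not shown here, plus the usual imports; a minimal sketch of what those might look like (the helper bodies are hypothetical, just enough to make the snippet runnable):

import os
import sys
from sec_edgar_downloader import Downloader

def get_filings_folder():
    # Hypothetical helper: return (and create) the folder the downloads go into.
    folder = os.path.abspath('../Filings')
    os.makedirs(folder, exist_ok=True)
    return folder

def print_progress(i, n):
    # Hypothetical helper: one-line progress indicator.
    sys.stdout.write('\rDownloading {} of {}'.format(i + 1, n))
    sys.stdout.flush()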
Example No. 2
    def form_valid(self, form):
        # Attach the requesting user to the form instance and run the parent validation.
        form.instance.user = self.request.user
        super().form_valid(form)

        #get Company CIK
        tik = form.data['ticker']
        URL = 'http://www.sec.gov/cgi-bin/browse-edgar?CIK={}&Find=Search&owner=exclude&action=getcompany'
        CIK_RE = re.compile(r'.*CIK=(\d{10}).*')
        f = requests.get(URL.format(tik), stream=True)
        results = CIK_RE.findall(f.text)
        if len(results):
            cik = results[0]

        cmp_name = self.edgar.getCompanyNameByCik(cik)

        #create object in database
        company = Company(ticker=tik,
                          cik=cik,
                          name=cmp_name,
                          user=self.request.user)
        company.save()

        # Remove any Company rows that were saved with an empty name.
        Company.objects.filter(name='').delete()

        # Creating filename and url structure
        BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        path = os.path.join(BASE_DIR, 'static')
        out_path = path + "/sec_edgar_filings/" + tik + "/10-K/"

        # Create the downloader pointed at the static folder. The raw .txt filing is too
        # large to load without XML iteration, so a lighter-weight alternative is worth exploring.
        dl = Downloader(path)

        # Download only the most recent 10-K by passing a count of 1.
        dl.get_10k_filings(tik, 1)

        # Strip the dashes from the accession number and swap .txt for /index.json
        # to build the path to the filing's index.
        file_name = [
            f for f in os.listdir(out_path)
            if os.path.isfile(os.path.join(out_path, f))
        ]
        switched_filename = file_name[0].replace('-', '').replace('.txt', '/index.json')

        # Base URL configuration for the EDGAR archive.
        bare_url = r"https://www.sec.gov/Archives/edgar/data/"
        base_url = r"https://www.sec.gov"
        documents_url = bare_url + str(results[0]) + "/" + switched_filename

        # Retrieve the filing index and locate the filing summary.
        content = requests.get(documents_url).json()

        for file in content['directory']['item']:

            # Grab the filing summary and create a new url leading to the file so we can download it.
            if file['name'] == 'FilingSummary.xml':

                xml_summary = base_url + content['directory'][
                    'name'] + "/" + file['name']

                #print info
                print('-' * 50)
                print('File Name: ' + file['name'])
                print('File Path: ' + xml_summary)

        # define a new base url that represents the filing folder. This will come in handy when we need to download the reports.
        base_url = xml_summary.replace('FilingSummary.xml', '')

        # request and parse the content
        content = requests.get(xml_summary).content
        soup = BeautifulSoup(content, 'lxml')

        # find the 'myreports' tag because this contains all the individual reports submitted.
        reports = soup.find('myreports')

        # I want a list to store all the individual components of the report, so create the master list.
        master_reports = []

        # Loop through each report in the 'myreports' tag, skipping the last one
        # because it raises an error.
        for report in reports.find_all('report')[:-1]:

            # create a dictionary to store all the different parts we need.
            report_dict = {}
            report_dict['name_short'] = report.shortname.text
            report_dict['name_long'] = report.longname.text
            report_dict['position'] = report.position.text
            report_dict['menu_category'] = report.menucategory.text
            report_dict['url'] = base_url + report.htmlfilename.text

            # append the dictionary to the master list.
            master_reports.append(report_dict)
            if report_dict['name_short'] == 'Consolidated Statements of Cash Flows':

                # print the info.
                print('-' * 50)
                print(base_url + report.htmlfilename.text)
                print(report.longname.text)
                print(report.shortname.text)
                print(report.menucategory.text)
                print(report.position.text)

                # Hold the URL in its own variable; a bug was otherwise writing a
                # different file path into the database.
                redirect_url_to_statement = base_url + report.htmlfilename.text

        # in case of multiple statements
        statements_url = []

        for report_dict in master_reports:

            # define the statements we want to look for.
            item1 = r"Consolidated Statements of Cash Flows"

            # store them in a list.
            report_list = [item1]

            # if the short name can be found in the report list.
            if report_dict['name_short'] in report_list:

                # print some info and store it in the statements url.
                print('-' * 50)
                print(report_dict['name_short'])
                print(report_dict['url'])

                statements_url.append(report_dict['url'])

        statement = Statement(year=2019,
                              type="CONSOLIDATED STATEMENTS OF CASH FLOWS",
                              url=redirect_url_to_statement,
                              company=company)
        statement.save()

        statements_data = []

        # loop through each statement url
        for statementUrl in statements_url:

            # define a dictionary that will store the different parts of the statement.
            statement_data = {}
            statement_data['headers'] = []

            statement_data['sections'] = []

            statement_data['data'] = []

            # request the statement file content
            content = requests.get(statementUrl).content
            report_soup = BeautifulSoup(content, 'lxml')

            # find all the rows, figure out what type of row it is, parse the elements, and store in the statement file list.
            for index, row in enumerate(report_soup.table.find_all('tr')):

                # first let's get all the elements.
                cols = row.find_all('td')

                # if it's a regular row and not a section or a table header
                if (len(row.find_all('th')) == 0
                        and len(row.find_all('strong')) == 0):
                    reg_row = [ele.text.strip() for ele in cols]
                    statement_data['data'].append(reg_row)

                # if it's a section row (bold label) and not a table header
                elif (len(row.find_all('th')) == 0
                      and len(row.find_all('strong')) != 0):
                    sec_row = cols[0].text.strip()
                    statement_data['sections'].append(sec_row)

                # finally if it's not any of those it must be a header
                elif (len(row.find_all('th')) != 0):
                    hed_row = [ele.text.strip() for ele in row.find_all('th')]
                    statement_data['headers'].append(hed_row)

                else:
                    print('We encountered an error.')

            # Write the parsed statement into the database: save each header,
            # then walk every data row for that header's column.
            print("Saving Headers...")
            for i in range(len(statement_data['headers'][1])):
                print(statement_data['headers'][1][i])
                statementHeader = Statment_element_headers(
                    field=statement_data['headers'][1][i], statement=statement)
                statementHeader.save()
                print("DATAAAAAAAA   ")
                print("Saving Data Element...")
                for j in statement_data['data']:
                    print(j)
                    print(j[i + 1])
                    k = j[i + 1]
                    # Normalize the value: strip dollar signs, spaces, and thousands-separator
                    # commas, turn accounting parentheses into a minus sign, then convert to float.
                    if any(ch in k for ch in ('$', ',', '(', ')')):
                        k = (k.replace('$', '').replace(' ', '').replace(',', '')
                             .replace('(', '-').replace(')', ''))
                        k = float(k)
                        print(k)
                    statementData = Statement_element_data(
                        key=j[0],
                        value=k,
                        statement=statement,
                        company=company,
                        header=statementHeader)
                    statementData.save()

                    print(j)

                    print("Saving Data Done for Element")
            print("Saving Headers Done")

            print("SECTIONSSSS   ")
            print("Saving Headers ...")
            for i in statement_data['sections']:
                print(i)
                statementSections = Statement_element_section(
                    fieldName=i, statement=statement)
                statementSections.save()
            print("Saving Sections Done...")

            # Append it to the master list for later analysis with pandas and NLP.
            statements_data.append(statement_data)

        # print(statements_data)

        return redirect('home')
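The key step in Example 2 is turning the downloaded filing's file name into the EDGAR index URL. A standalone sketch of that transformation, using a hypothetical accession number and CIK:

# Standalone illustration of the URL construction above; the values are hypothetical.
accession_file = "0000320193-19-000119.txt"   # file name as saved by the downloader
cik = "0000320193"                            # 10-digit CIK from the regex lookup
switched = accession_file.replace('-', '').replace('.txt', '/index.json')
documents_url = "https://www.sec.gov/Archives/edgar/data/" + cik + "/" + switched
# -> https://www.sec.gov/Archives/edgar/data/0000320193/000032019319000119/index.json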
Example No. 3
],
                     axis=0)

n_cik = nyse_nas.shape[0]
error_list = []
print('Number of Stocks: ' + str(n_cik))
n = 0


# Remove CIKs that have already been downloaded from the list
def diff(first, second):
    n = len(os.listdir('D:\\Thesis_data\\sec_edgar_filings'))
    second = set(second)
    return [item for item in first if str(item) not in second], n


download_list, n = diff(list(nyse_nas['CIK']),
                        os.listdir('D:\\Thesis_data\\sec_edgar_filings'))

for cik in download_list:
    if n % 10 == 0:
        print('No.' + str(n) + ' is processing... ' +
              str(round(n / n_cik * 100, 2)) + '%')
    try:
        response = dl.get_10k_filings(cik, 25)
    except Exception:
        error_list.append(cik)
        continue
    if not response:
        os.mkdir('D:\\Thesis_data\\sec_edgar_filings\\' + str(cik))
    time.sleep(5)
    n = n + 1
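The loop above records failing CIKs in error_list but never retries them; a minimal sketch of a second pass, assuming the same dl object is still in scope:

# Hedged sketch: retry the CIKs that failed during the first pass.
still_failing = []
for cik in error_list:
    try:
        dl.get_10k_filings(cik, 25)   # same call as the main loop
    except Exception:
        still_failing.append(cik)     # keep for manual inspection
    time.sleep(5)                     # same pacing as the main loop
print('CIKs still failing after retry: ' + str(still_failing))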
Example No. 4
# This script downloads 10-K and 10-Q reports for the specified tickers from EDGAR
# using the sec_edgar_downloader package and renames them so they resolve correctly
# as HTML files.
import os
from sec_edgar_downloader import Downloader

basepath = 'C:\\Users\\Dell\\OneDrive - George Mason University\\MBA 797\\Stock Data\\'
SMIF_tickers = [
    "GOOG", "AMZN", "BBT", "BA", "BMY", "CBRE", "CSCO", "C", "STZ", "CVA", "D",
    "XLE", "ESS", "FTNT", "GS", "HCP", "XLV", "HON", "JPM", "KSU", "LEN",
    "MSFT", "NEE", "PYPL", "PFE", "PNC", "RTN", "SYF", "TJX", "UNH", "VZ",
    "WMT", "DIS", "WDC"
]

for ticker in SMIF_tickers:
    dl = Downloader(basepath + ticker)
    dl.get_10k_filings(ticker, 5)
    dl.get_10q_filings(ticker, 4)

# Rename the downloaded filings from .txt to .html so they open correctly in a browser.
for ticker in SMIF_tickers:
    for form_type in ("10-K", "10-Q"):
        filing_dir = os.path.join(basepath + ticker, "sec_edgar_filings",
                                  ticker, form_type)
        for filename in os.listdir(filing_dir):
            os.rename(os.path.join(filing_dir, filename),
                      os.path.join(filing_dir, filename[:-3] + "html"))
Example No. 5
def getFilings(ticker):

    # Get all 10-K and 10-Q filings for a ticker
    dl = Downloader(os.getcwd())
    dl.get_10k_filings(ticker)
    dl.get_10q_filings(ticker)

    # Get the directories of the newly added files
    directoryK = os.getcwd() + "/sec_edgar_filings/" + ticker + "/10-K"
    directoryQ = os.getcwd() + "/sec_edgar_filings/" + ticker + "/10-Q"

    # Create dataframe to store information scraped from filings
    SECInfo = pd.DataFrame(columns=[
        "Filing Type", "Filing Year", "Filing Date", "Net Income",
        "MDA Sentiment Analysis"
    ])

    # For each new text file, go through and CLEAN IT!
    for filename in os.listdir(directoryK):
        if filename.endswith(".txt"):

            # Make a new cleaned file
            year = re.search('-(.*)-', filename).group(1)
            html = open(directoryK + "/" + filename)
            f = html.read()
            name = directoryK + "-cleaned" + "/" + ticker + "-" + year + "-" + "10K.txt"
            os.makedirs(os.path.dirname(name), exist_ok=True)

            # Store the sentiment of each word as the scraper goes through the MDA
            sentiment = []

            # If there is an error, move onto the next file.
            try:

                # Convert the HTML to a readable format in the first file
                w = open(name, "w")
                w.write(html2text.html2text(f))
                html.close()
                name2 = directoryK + "-MDA" + "/" + ticker + "-" + year + "-" + "10K-MDA.txt"
                os.makedirs(os.path.dirname(name2), exist_ok=True)
                w.close()

                # Convert the Readable Format to MDA in the second file
                wfile = open(name, "r")
                w = wfile.readlines()
                w2 = open(name2, "w")

                # For each line, check to see if it is the start of an MDA section or the start of the next section.
                flag = False
                for line in w:

                    if flag or "discussion and analysis of" in line.lower(
                    ).rstrip(
                    ) or "management's discussion and analysis" in line.lower(
                    ).rstrip():

                        # Make sure the line is legitimate and not all punctuation before adding
                        if len(line) > 20 and count_punct(
                                line) < 4 and " " in line:
                            w2.write(line)
                        flag = True

                        # Conduct sentiment analysis
                        pol_score = sid.polarity_scores(line)
                        sentiment.append(pol_score["compound"])

                    if "financial statements and supplementary data" in line.lower(
                    ).rstrip() or "statements and supplementary" in line.lower(
                    ).rstrip():

                        flag = False

                    # Get the time of the filing
                    if "conformed period of report" in line.lower().rstrip():
                        filingDateRaw = line.lower().split("report: ",
                                                           1)[1][:8]
                        filingDate = filingDateRaw[0:4] + "-" + filingDateRaw[
                            4:6] + "-" + filingDateRaw[-2:]

                wfile.close()
                w2.close()

                # This is a placeholder value that I did not get to resolve
                netIncome = True

                try:
                    SECInfo = SECInfo.append(
                        {
                            "Filing Type": "10-K",
                            "Filing Year": year,
                            "Filing Date": filingDate,
                            "Net Income": netIncome,
                            "MDA Sentiment Analysis": sentiment
                        },
                        ignore_index=True)

                except UnboundLocalError:
                    continue
            except (NotImplementedError, UnicodeEncodeError) as error:
                print("not implemented error for " + year)
                continue

            continue
        else:
            continue

    # This is the same loop as above, except for 10-Q filings instead of 10-Ks. See those comments.
    for filename in os.listdir(directoryQ):
        if filename.endswith(".txt"):

            year = re.search('-(.*)-', filename).group(1)
            html2 = open(directoryQ + "/" + filename)
            f = html2.read()

            name = directoryQ + "-cleaned" + "/" + ticker + "-" + year + "-" + "10Q.txt"
            print(name)

            flag = False

            os.makedirs(os.path.dirname(name), exist_ok=True)
            w = open(name, "w")

            try:
                w.write(html2text.html2text(f))
                html2.close()

                name2 = directoryQ + "-MDA" + "/" + ticker + "-" + year + "-" + filename[
                    14:20] + "-10Q-MDA.txt"
                os.makedirs(os.path.dirname(name2), exist_ok=True)
                w.close()

                wfile = open(name, "r")
                w = wfile.readlines()

                w2 = open(name2, "w")

                sentiment = []

                # Placeholder, mirroring the 10-K loop above; otherwise netIncome is
                # undefined here when no 10-K filing was processed first.
                netIncome = True

                flag = False
                for line in w:

                    if flag or "s discussion and analysis of" in line.lower(
                    ).rstrip(
                    ) or "management's discussion and analysis" in line.lower(
                    ).rstrip():

                        if len(line) > 20 and count_punct(
                                line) < 5 and " " in line:
                            w2.write(line)
                        flag = True

                        pol_score = sid.polarity_scores(line)
                        sentiment.append(pol_score["compound"])

                    if "controls and procedures" in line.lower(
                    ) or "in witness whereof" in line.lower(
                    ) or "item 4." in line.lower():
                        flag = False

                    # Get the time of the filing
                    if "conformed period of report" in line.lower().rstrip():
                        filingDateRaw = line.lower().split("report: ",
                                                           1)[1][:8]
                        filingDate = filingDateRaw[0:4] + "-" + filingDateRaw[
                            4:6] + "-" + filingDateRaw[-2:]

                wfile.close()
                w2.close()
                SECInfo = SECInfo.append(
                    {
                        "Filing Type": "10-Q",
                        "Filing Year": year,
                        "Filing Date": filingDate,
                        "Net Income": netIncome,
                        "MDA Sentiment Analysis": sentiment
                    },
                    ignore_index=True)

            except (NotImplementedError, UnicodeEncodeError) as error:
                w.close()
                print("not implemented error for " + year)
                continue

            continue
        else:
            continue

    # Convert the large DataFrame we have made to a CSV for later use.
    SECInfo.to_csv("sec_processed_filings/" + ticker + "-SEC-Information.csv")
Example No. 6
import os
import requests
import re
import edgar
import sys
from sec_edgar_downloader import Downloader

# sys.setrecursionlimit(30000)

# Create the downloader with an absolute download path
input_ticker = "WMT"
path = "/home/merovingie/PycharmProjects/Sec-Scrap/secscrap/edgardownloadertrial"
out_path = path + "/sec_edgar_filings/" + input_ticker + "/10-K/"
print(out_path)
dl = Downloader(path)
# download the latest one
dl.get_10k_filings(input_ticker, 1)

# # create soup object
# soup = BeautifulSoup(open("/home/merovingie/PycharmProjects/Sec-Scrap/secscrap/edgardownloadertrial/sec_edgar_filings/WMT/10-K/0000104169-19-000016.txt"))
# print(soup.prettify())

DEFAULT_TICKERS = [input_ticker]
URL = 'http://www.sec.gov/cgi-bin/browse-edgar?CIK={}&Find=Search&owner=exclude&action=getcompany'
CIK_RE = re.compile(r'.*CIK=(\d{10}).*')

# change Ticker into CIK
cik_dict = {}
for ticker in DEFAULT_TICKERS:
    f = requests.get(URL.format(ticker), stream=True)
    results = CIK_RE.findall(f.text)
    if len(results):