def handleMissingEdgarIndex(edgar_addresses_dirpath, start_year=1993):
    """Download the EDGAR index tables into ``edgar_addresses_dirpath``.

    The target directory is created first when it does not exist yet,
    including any missing parent directories.

    Args:
        edgar_addresses_dirpath: Directory handed to ``edgar.download_index``
            as the download destination.
        start_year: Year handed to ``edgar.download_index`` (defaults to 1993).
    """
    print("Downloading EDGAR address tables to \"" + edgar_addresses_dirpath + "\"")
    # makedirs(exist_ok=True) also creates missing parent directories (plain
    # os.mkdir does not) and does not fail if the directory already exists.
    os.makedirs(edgar_addresses_dirpath, exist_ok=True)
    # Kept as a function-local import so the third-party package is only
    # loaded when this function actually runs.
    import edgar
    edgar.download_index(edgar_addresses_dirpath, start_year)
def _download(year: int):
    """Download the quarterly EDGAR index files and list ``data_path``.

    Creates ``data_path`` when it is missing, calls ``edgar.download_index``
    with ``skip_all_present_except_last=False`` and returns the names found
    in ``data_path`` afterwards.
    """
    # The download fetches large .tsv files containing all listed filings
    # per quarter.
    # TODO: check data_path for existing files so that only the most
    # up-to-date files are downloaded.
    directory_missing = not os.path.exists(data_path)
    if directory_missing:
        os.makedirs(data_path)

    edgar.download_index(data_path, year, skip_all_present_except_last=False)
    print("Downloading financial data")

    directory_listing = os.listdir(data_path)
    return directory_listing
def GetEdgarIndex(self):
    """Download the EDGAR index into ``self.edgar_dir``.

    A leading ``~`` in ``self.edgar_dir`` is expanded before the download.
    Any exception raised by the download is printed instead of propagated.
    """
    # TODO: validate the data type of self.start_year before using it.
    target_dir = os.path.expanduser(self.edgar_dir)
    first_year = self.start_year
    try:
        edgar.download_index(target_dir, first_year)
    except Exception as exc:
        print('Exception:', str(exc))
def _download_indexes(self, since_year: float) -> List[str]:
    """Download the EDGAR company indexes into ``self.tmp_dir``.

    Returns the sorted paths of the files found directly inside
    ``self.tmp_dir`` once the download call has returned.
    """
    edgar.download_index(self.tmp_dir, since_year=since_year)

    # Only the first os.walk() entry (the top level of tmp_dir) is used; the
    # default tuple covers a walk that yields nothing at all.
    _, _, top_level_names = next(os.walk(self.tmp_dir), (None, None, []))
    collected = [os.path.join(self.tmp_dir, name) for name in top_level_names]

    self.logg.debug(f'Collected {len(collected)} files...')
    return sorted(collected)
def test_edgar(self):
    """Download the 2019 index into a temp dir and check the Q1 file's first row."""
    expected_first_line = (
        "1000045|NICHOLAS FINANCIAL INC|10-Q|2019-02-14|edgar/data/1000045/0001193125-19-039489.txt|edgar/data/1000045/0001193125-19-039489-index.html\n"
    )
    with tempfile.TemporaryDirectory() as tmpdirname:
        print("created temporary directory", tmpdirname)
        edgar.download_index(tmpdirname, 2019)

        quarter_file = tmpdirname + "/2019-QTR1.tsv"
        with open(quarter_file, "r", encoding="utf-8") as handle:
            actual_first_line = handle.readline()
            self.assertEqual(actual_first_line, expected_first_line)
def get_index(self, since_year):
    """Retrieve the complete filing index from the EDGAR SEC archives.

    The download destination is ``self.path``; files that are already
    present are not skipped (``skip_all_present_except_last=False``).

    Parameters
    ----------
    since_year : int
        The year, as an int, from which the filings should be loaded
        (lowest is 1993).
    """
    destination = self.path
    edgar.download_index(destination, since_year, skip_all_present_except_last=False)
def main(argv):
    """Parse ``argv``, load the configuration and download the EDGAR index.

    Recognised options:
        -h                      call ``info()`` and exit
        -c / --config_path ARG  path handed to ``Config``

    A command line that ``getopt`` rejects calls ``info()`` and exits with
    status 2.
    """
    try:
        parsed_options, _ = getopt.getopt(argv, 'hc:', ['config_path='])
    except getopt.GetoptError:
        info()
        sys.exit(2)

    # Stays empty unless -c/--config_path is given.
    config_path = ''
    for flag, value in parsed_options:
        if flag == '-h':
            info()
            sys.exit()
        elif flag in ('-c', '--config_path'):
            config_path = value

    config = Config(config_path)
    edgar.download_index(config.master_path, config.since_year)
# Command-line entry: parse the start year and target directory, then download
# the EDGAR filing index files into that directory.
if sys.version_info[0] < 3:
    raise Exception("Must be using Python 3")

parser = ArgumentParser()
parser.add_argument(
    "-y",
    "--from-year",
    type=int,
    dest="year",
    help="The year from which to start downloading "
    + "the filing index. Default to current year",
    default=datetime.date.today().year,
)
parser.add_argument(
    "-d",
    "--directory",
    dest="directory",
    # The first fragment ends with a space so the help text no longer reads
    # "...files willbe downloaded to...".
    help="A directory where the filing index files will "
    + "be downloaded to. Default to a temporary directory",
    # The temporary directory is created lazily below; calling
    # tempfile.mkdtemp() here would create (and leave behind) a directory on
    # every run, even when -d is supplied.
    default=None,
)
args = parser.parse_args()
if args.directory is None:
    args.directory = tempfile.mkdtemp()

logger.debug("downloads will be saved to %s", args.directory)
edgar.download_index(args.directory, args.year)
logger.info("Files downloaded in %s", args.directory)
def main():
    """Download the EDGAR filing index, starting with 2015, into ./data/index."""
    import edgar

    index_dir = "./data/index"
    first_year = 2015
    edgar.download_index(index_dir, first_year, skip_all_present_except_last=False)
# Install the edgar package first if it is not already installed:
# "pip install edgar"
import edgar

# Folder handed to edgar.download_index as the download destination.
download_dir = 'C:\\Users\\Jayashree RAMAN\\Documents\\DellDocs\\Capstone\\EDGAR_Downloads'
edgar.download_index(download_dir, 2018)
## Import required packages
## Install the packages first if required
import edgar
import pandas as pd
import requests
import re
import matplotlib.pyplot as plt

## Download the index files, which list the companies together with the urls
## of their filings for a given year, to the index_files folder
## Downloaded in tsv format
edgar.download_index("./index_files/", 2018)

## Convert the tsv to a dataframe
edgar_directories = pd.read_csv('./index_files/2018-QTR1.tsv', sep='|', header=None)

## Select only the 10K and 10Q filings (column 2 holds the filing type)
edgar_10q_10k = edgar_directories[edgar_directories[2].isin(['10-Q', '10-K'])]

## Write the filtered filings list to a csv file for future use.
## One path is used for both writing and re-reading: previously the file was
## written to the working directory but read back from ./index_files/, so the
## read could not find the file that had just been written.
filtered_csv_path = 'EDGAR_10K_10Q.csv'
edgar_10q_10k.to_csv(filtered_csv_path)

## Read the csv file to a dataframe
Edgar_10K_DF = pd.read_csv(filtered_csv_path)

## Rename the column headers; the first column is the dataframe index that
## to_csv wrote alongside the six data columns
Edgar_10K_DF.columns = [
    'S.No',
    'CompanyID',
    'CompanyName',
    'TypeOfFiling',
    'DateFiled',
    'TextURL',
    'HTMLURL',
]
# Cleans up the SEC tsv files after the script has run and the relevant
# information has been extracted.
def purge_tsv_files():
    """Delete every file named in ``tsv_files`` from ``directory_path``."""
    for tsv_name in tsv_files:
        os.remove(directory_path + tsv_name)
        print('{} deleted'.format(tsv_name))


# Download the list of all company filings (including where those files are
# located) unless the first-quarter index for `year` is already on disk.
if os.path.exists(directory_path + str(year) + '-' + 'QTR1.tsv'):
    print('No new files downloaded')
else:
    edgar.download_index(directory_path, year, skip_all_present_except_last=False)
    print('Files downloaded successfully')

# Both cases continue the same way: list the directory, then normalise names.
tsv_files = os.listdir(directory_path)
tsv_name_formatter()

# Take all the tsv files that were downloaded for the year and compile the
# different financial reports.
# NOTE(review): the path below is hard-coded rather than built from
# directory_path - confirm that both refer to the same folder.
for file in tsv_file_names:
    csv = pd.read_csv(
        '/home/user/Documents/python_webscraper/' + file + '.tsv',
        sep='\t',
        lineterminator='\n',
        names=None,
    )
    csv.columns.values[0] = 'Item'
def download_files(path, since_year=2019):
    """Download the EDGAR filing index files into ``path``.

    Args:
        path: Destination directory handed to ``edgar.download_index``.
        since_year: Year handed to ``edgar.download_index``. Defaults to
            2019, the value that used to be hard-coded here.

    Returns:
        Whatever ``edgar.download_index`` returns, passed through unchanged.
    """
    return edgar.download_index(path, since_year, skip_all_present_except_last=False)
def get_indices(since_year_arg):
    """Download the EDGAR index files into ``./indices/``.

    The ``./indices`` directory is created first when it does not exist.
    ``since_year_arg`` is handed to ``edgar.download_index`` unchanged.
    """
    download_directory = "./indices/"
    if not os.path.exists("./indices"):
        os.makedirs("./indices")
    edgar.download_index(download_directory, since_year_arg)
# User agent option; checked after parsing, where a missing value aborts.
parser.add_argument(
    "-ua",
    "--user-agent",
    dest="ua",
    help=(
        "The User Agent to set. This must be set properly "
        "else the SEC may temporarily ban you. See https://www.sec.gov/os/accessing-edgar-data"
    ),
)
# Boolean flag, forwarded as the last argument of edgar.download_index.
parser.add_argument(
    "-s",
    "--skip-all-present-except-last",
    action="store_true",
    dest="skip",
    help=(
        "Specify this flag to skip downloading filing index"
        " files that are already present. Only the most recent"
        " file is downloaded. If not specified all files are"
        " downloaded again."
    ),
)
args = parser.parse_args()

# Refuse to run without a user agent: log the reason and exit with status 1.
if args.ua is None:
    logger.error("A user agent is required. See https://www.sec.gov/os/accessing-edgar-data")
    sys.exit(1)

logger.debug("downloads will be saved to %s" % args.directory)
edgar.download_index(args.directory, args.year, args.ua, args.skip)
logger.info("Files downloaded in %s" % args.directory)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
@author: yxy
"""
import edgar

# Download filings starting from 1994
download_dir = "/Users/yxy/Downloads/secfilings"
first_year = 1994
edgar.download_index(download_dir, first_year)
import argparse
import edgar
import pdfkit


def update_index():
    """Download the EDGAR index via ``edgar.download_index``.

    NOTE(review): ``dir`` and ``since_year`` are not defined anywhere in the
    visible code. Unless module-level values are supplied elsewhere, ``dir``
    resolves to the builtin function and ``since_year`` raises NameError -
    confirm where both are meant to come from.
    """
    edgar.download_index(dir, since_year)


def __main__():
    """Build the command-line parser, parse the arguments and print them.

    The parser has a ``--config`` option and two subcommands, ``download``
    (with a positional ``cik``) and ``search``; the chosen subcommand is
    stored under ``subcommand``.
    """
    parser = argparse.ArgumentParser(description='Interact with EDGAR https://www.sec.gov/edgar.shtml')
    parser.add_argument('--config', help='Path to an edgarcli config file')
    subparsers = parser.add_subparsers(dest='subcommand')
    download_parser = subparsers.add_parser('download', help='download filings from EDGAR')
    search_parser = subparsers.add_parser('search', help='search stuff from EDGAR')
    download_parser.add_argument('cik', help='CIK number')
    # An argument-less download_parser.add_argument() call used to follow
    # here; argparse needs at least a name or flag, so it failed on every run.
    # TODO: add the remaining download options once they are decided.
    args = parser.parse_args()
    # This used to print args.accumulate(args.integers), but this parser
    # defines neither attribute; print the parsed namespace instead.
    print(args)
# Attach a timestamp / level / message format to the handler and register it.
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter)
logger.addHandler(ch)

if __name__ == "__main__":
    # Command-line entry: parse the start year and target directory, then
    # download the EDGAR filing index files into that directory.
    parser = ArgumentParser()
    parser.add_argument(
        "-y",
        "--from-year",
        type=int,
        dest="year",
        help='The year from which to start downloading ' +
        'the filing index. Default to current year',
        default=datetime.date.today().year)

    parser.add_argument(
        "-d",
        "--directory",
        dest="directory",
        # The first fragment ends with a space so the help text no longer
        # reads "...files willbe downloaded to...".
        help='A directory where the filing index files will ' +
        'be downloaded to. Default to a temporary directory',
        # The temporary directory is created lazily below; calling
        # tempfile.mkdtemp() here would create (and leave behind) a directory
        # on every run, even when -d is supplied.
        default=None)

    args = parser.parse_args()
    if args.directory is None:
        args.directory = tempfile.mkdtemp()

    logger.debug("downloads will be saved to %s", args.directory)
    edgar.download_index(args.directory, args.year)
    logger.info("Files downloaded in %s", args.directory)