def save(self, directory): """Save files in specified directory. Each txt url looks something like: https://www.sec.gov/Archives/edgar/data/1018724/000101872419000043/0001018724-19-000043.txt Args: directory (str): Path to directory where files should be saved. Returns: None Raises: ValueError: If no text urls are available for given filing object. """ urls = self.get_urls() if all(len(urls[cik]) == 0 for cik in urls.keys()): raise ValueError("No filings available.") for cik, links in urls.items(): for link in links: data = requests.get(link).text accession_number = link.split("/")[-1] path = os.path.join(directory, cik, self.filing_type.value) make_path(path) path = os.path.join(path, accession_number) with open(path, "w") as f: f.write(data)
def save(self, directory): """Save all daily filings. Will store all filings for each unique company name under a separate subdirectory within given directory argument. Ex: my_directory | ---- Apple Inc. | ---- ...txt files ---- Microsoft Corp. | ---- ...txt files Args: directory (str): Directory where filings should be stored. Will be broken down further by company name and form type. """ self.get_filings_dict() for filings in self._filings_dict.values(): # take the company name from the first filing and make that the subdirectory name subdirectory = os.path.join(directory, filings[0].company_name) make_path(subdirectory) for filing in filings: filename = filing.file_name.split('/')[-1] filing_path = os.path.join(subdirectory, filename) url = self.make_url(filename) data = requests.get(url).text with open(filing_path, 'w') as f: f.write(data)
def save(self, directory): """Save files in specified directory. Each txt url looks something like: https://www.sec.gov/Archives/edgar/data/1018724/000101872419000043/0001018724-19-000043.txt Args: directory (str): Path to directory where files should be saved. Returns: None Raises: ValueError: If no text urls are available for given filing object. """ urls = self.get_urls() if len(urls) == 0: raise ValueError("No filings available.") doc_names = [url.split("/")[-1] for url in urls] for (url, doc_name) in list(zip(urls, doc_names)): cik = doc_name.split('-')[0] data = requests.get(url).text path = os.path.join(directory, cik, self.filing_type.value) make_path(path) path = os.path.join(path, doc_name) with open(path, "w") as f: f.write(data)
def save_filings(self, directory): """Save all filings. Will store all filings for each unique CIK under a separate subdirectory within given directory argument. Ex: my_directory | ---- CIK 1 | ---- ...txt files ---- CIK 2 | ---- ...txt files Args: directory (str): Directory where filings should be stored. """ urls = self._check_urls_exist() for company, links in urls.items(): for link in links: data = requests.get(link).text path = os.path.join(directory, company) make_path(path) path = os.path.join(path, self.get_accession_number(link)) with open(path, "w") as f: f.write(data)
def save_filings(self, directory):
    """Save all filings.

    Will store all filings for each unique company name under a separate
    subdirectory within given directory argument.

    Ex:
    my_directory
    |
    ---- Apple Inc.
         |
         ---- ...txt files
    ---- Microsoft Corp.
         |
         ---- ...txt files

    Args:
        directory (str): Directory where filings should be stored.
            Will be broken down further by company name and form type.
    """
    self.get_filings_dict()
    for filings in self._filings_dict.values():
        # Take the company name from the first filing, cleaned of special
        # characters so it forms a valid directory name, and use it as the
        # subdirectory name.
        clean_company_name = self.clean_directory_path(filings[0].company_name)
        subdirectory = os.path.join(directory, clean_company_name)
        make_path(subdirectory)
        for filing in filings:
            filename = self.get_accession_number(filing.file_name)
            filing_path = os.path.join(subdirectory, filename)
            url = self.make_url(filename)
            data = requests.get(url).text
            with open(filing_path, 'w') as f:
                f.write(data)
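
`clean_directory_path` resolves the TODO from the earlier variant but is not shown here. A plausible sketch, assuming it only strips characters that are awkward in directory names:

import re

@staticmethod
def clean_directory_path(path):
    # Assumption: keep alphanumerics, spaces, dots, hyphens, and underscores;
    # drop everything else (e.g. "Apple Inc./DE" -> "Apple Inc.DE").
    return re.sub(r"[^a-zA-Z0-9 ._-]", "", path)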
def test_make_path_expand_user(self):
    # Make sure that you do not have a directory matching this if testing locally.
    path_to_expand = "~/_____testing_____"
    utils.make_path(path_to_expand)
    path_expanded = os.path.expanduser(path_to_expand)
    try:
        assert os.path.exists(path_expanded)
    finally:
        os.rmdir(path_expanded)
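
The `make_path` helper exercised by this test (and used throughout the snippets above) is not included in the excerpt. A minimal sketch consistent with the test's expectation that `~` gets expanded:

import os

def make_path(path):
    # Expand "~" so the test above finds the directory under the home
    # directory rather than a literal "~" folder in the working directory.
    path = os.path.expanduser(path)
    if not os.path.exists(path):
        os.makedirs(path)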
def _save_filings(self,
                  directory,
                  dir_pattern="{cik}",
                  file_pattern="{accession_number}",
                  download_all=False):
    """Save all filings.

    Will store all filings under the parent directory of ``directory``,
    further separating filings using ``dir_pattern`` and ``file_pattern``.

    Args:
        directory (str): Directory where filings should be stored.
        dir_pattern (str): Format string for subdirectories.
            Default is `{cik}`. Valid options are `{cik}`.
        file_pattern (str): Format string for files.
            Default is `{accession_number}`. Valid options are
            `{accession_number}`.
        download_all (bool): If True, download the bulk tar files and extract
            the filings from them; if False, download each file in the index
            individually. Default is `False`.
    """
    urls = self._check_urls_exist()
    if download_all:
        # Download tar files into a large temporary directory.
        extract_directory = os.path.join(directory, 'temp')
        i = 0
        while os.path.exists(extract_directory):
            # Ensure that there is no name clashing.
            extract_directory = os.path.join(directory, 'temp{i}'.format(i=i))
            i += 1
        make_path(extract_directory)
        self._unzip(extract_directory=extract_directory)
        self._move_to_dest(urls=urls,
                           extract_directory=extract_directory,
                           directory=directory,
                           file_pattern=file_pattern,
                           dir_pattern=dir_pattern)
        # Remove the initial extracted data.
        shutil.rmtree(extract_directory)
    else:
        inputs = []
        for company, links in urls.items():
            formatted_dir = dir_pattern.format(cik=company)
            for link in links:
                formatted_file = file_pattern.format(
                    accession_number=self.get_accession_number(link))
                path = os.path.join(directory, formatted_dir, formatted_file)
                inputs.append((link, path))
        loop = asyncio.get_event_loop()
        loop.run_until_complete(self.client.wait_for_download_async(inputs))
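
A standalone illustration of how `dir_pattern` and `file_pattern` compose into a destination path in the non-bulk branch above (the concrete values are made up for the example):

import os

dir_pattern = "cik_{cik}"
file_pattern = "{accession_number}.txt"
path = os.path.join("/tmp/filings",
                    dir_pattern.format(cik="1018724"),
                    file_pattern.format(accession_number="0001018724-19-000043"))
# path == "/tmp/filings/cik_1018724/0001018724-19-000043.txt"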
def _do_create_and_copy(q):
    """Create path and copy file to end of path.

    Args:
        q (queue.Queue): Queue to get filename, new directory, and old path
            information from.
    """
    while True:
        try:
            filename, new_dir, old_path = q.get(timeout=1)
        except Empty:
            return
        make_path(new_dir)
        path = os.path.join(new_dir, filename)
        shutil.copyfile(old_path, path)
        q.task_done()
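
A hedged sketch of how this worker might be driven: fill a queue with `(filename, new_dir, old_path)` tuples and let a few threads drain it. The `jobs` iterable and the thread count are assumptions for illustration:

import threading
from queue import Queue

q = Queue()
for job in jobs:  # `jobs`: assumed iterable of (filename, new_dir, old_path)
    q.put(job)

threads = [threading.Thread(target=_do_create_and_copy, args=(q,), daemon=True)
           for _ in range(4)]
for t in threads:
    t.start()
q.join()  # task_done() in the worker lets join() return once every copy finishes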
async def fetch_and_save(link, path, session): """Fetch link and save to path using session.""" contents = await self.fetch(link, session) make_path(os.path.dirname(path)) with open(path, "wb") as f: f.write(contents)
def process(self, infile, out_dir=None, create_subdir=True, rm_infile=False):
    """Process a text file and save processed files.

    Args:
        infile (str): Full path to a text file.
        out_dir (str): Directory to store output files. Defaults to the
            parent directory of infile.
        create_subdir (bool): If a subdirectory with the name of the infile
            should be created. If this is not true, files will be prefixed
            with the infile filename.
        rm_infile (bool): If the infile should be removed after processing.
            Defaults to False.

    Returns:
        None
    """
    if not infile.endswith('.txt'):
        raise ValueError(
            '{file} does not appear to be a .txt file.'.format(file=infile))

    with open(infile, encoding="utf8") as f:
        intxt = f.read()

    if out_dir is None:
        out_dir = os.path.dirname(infile)
    infile_base = os.path.basename(infile).split('.txt')[0]
    metadata_file_format = "{base}_{num}.metadata.json"
    document_file_format = '{base}_{sec_doc_num}.{file}'
    if create_subdir:
        out_dir = os.path.join(out_dir, infile_base)
        make_path(out_dir)
        metadata_file_format = "{num}.metadata.json"
        document_file_format = '{sec_doc_num}.{file}'

    sec_doc_cursor = 0
    sec_doc_count = intxt.count("<SEC-DOCUMENT>")
    for sec_doc_num in range(sec_doc_count):
        sec_doc_match = self.re_sec_doc.search(intxt, pos=sec_doc_cursor)
        if not sec_doc_match:
            break
        sec_doc_cursor = sec_doc_match.span()[1]
        sec_doc = sec_doc_match.group(1)

        # Metadata
        metadata_match = self.re_sec_header.search(sec_doc)
        metadata_txt = metadata_match.group(1)
        metadata_cursor = metadata_match.span()[1]
        metadata_filename = metadata_file_format.format(base=infile_base,
                                                        num=sec_doc_num)
        metadata_file = os.path.join(out_dir, metadata_filename)
        metadata_dict = self.process_metadata(metadata_txt)
        # logging.info("Metadata written into {}".format(metadata_file))

        # Loop through every document
        metadata_dict["documents"] = []
        documents = sec_doc[metadata_cursor:].strip()
        doc_count = documents.count("<DOCUMENT>")
        doc_cursor = 0
        for doc_num in range(doc_count):
            doc_match = self.re_doc.search(documents, pos=doc_cursor)
            if not doc_match:
                break
            doc = doc_match.group(1)
            doc_cursor = doc_match.span()[1]
            doc_metadata = self.process_document_metadata(doc)
            metadata_dict["documents"].append(doc_metadata)

            # Get file data and file name
            doc_filename = doc_metadata["filename"]
            doc_txt = self.re_text.search(doc).group(1).strip()
            target_doc_filename = document_file_format.format(
                base=infile_base, sec_doc_num=sec_doc_num, file=doc_filename)
            doc_outfile = os.path.join(out_dir, target_doc_filename)

            is_uuencoded = doc_txt.find("begin 644 ") != -1
            if is_uuencoded:
                logging.info("{} contains a uu-encoded file".format(infile))
                encfn = doc_outfile + ".uu"
                with open(encfn, "w", encoding="utf8") as encfh:
                    encfh.write(doc_txt)
                uu.decode(encfn, doc_outfile)
                os.remove(encfn)
            else:
                logging.info("{} contains a non uu-encoded file".format(infile))
                with open(doc_outfile, "w", encoding="utf8") as outfh:
                    outfh.write(doc_txt)

        # Save SEC-DOCUMENT metadata to file
        with open(metadata_file, "w", encoding="utf8") as fileh:
            formatted_metadata = json.dumps(metadata_dict, indent=2,
                                            sort_keys=True, ensure_ascii=False)
            fileh.write(formatted_metadata)

    if rm_infile:
        os.remove(infile)
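
The compiled patterns this method expects on `self` (`re_sec_doc`, `re_sec_header`, `re_doc`, `re_text`) are not part of the excerpt. A hedged sketch consistent with how they are searched above; the real patterns may differ in detail:

import re

re_sec_doc = re.compile(r"<SEC-DOCUMENT>(.*?)</SEC-DOCUMENT>", re.DOTALL)
re_sec_header = re.compile(r"<SEC-HEADER>(.*?)</SEC-HEADER>", re.DOTALL)
re_doc = re.compile(r"<DOCUMENT>(.*?)</DOCUMENT>", re.DOTALL)
re_text = re.compile(r"<TEXT>(.*?)</TEXT>", re.DOTALL)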