def download_filing_index_data(year: int = None):
    """
    Download all EDGAR filing index files, storing each via the configured
    storage client (S3 or local disk), and record which were fetched.

    :param year: optional year to restrict the index listing; when None, the
        complete index list is retrieved
    :return: list of (file_path, downloaded, is_processed) tuples, where
        downloaded is True when the file was fetched from EDGAR on this run
    """
    # Get filing index list, optionally limited to a single year.
    if year is not None:
        filing_index_list = openedgar.clients.edgar.list_index_by_year(year)
    else:
        filing_index_list = openedgar.clients.edgar.list_index()

    path_list = []

    # BUG FIX: os.environ["CLIENT_TYPE"] raises KeyError when the variable is
    # unset, so the original `configured_client is None` branch could never
    # fire.  .get() returns None instead, making the S3 default reachable.
    configured_client = os.environ.get("CLIENT_TYPE")
    logger.info(msg="Configured client is: {}".format(configured_client))

    path_prefix = ""
    if configured_client is None or configured_client == "S3":
        # Create S3 client
        download_client = S3Client()
    else:
        download_client = LocalClient()
        path_prefix = os.environ["DOWNLOAD_PATH"]

    # Now iterate through the list, downloading anything not already stored.
    for filing_index_path in filing_index_list:
        # Cleanup path: strip the EDGAR /Archives/ prefix so the stored path
        # is relative to the storage root.
        if filing_index_path.startswith("/Archives/"):
            file_path = os.path.join(path_prefix, filing_index_path[len("/Archives/"):])
        else:
            file_path = os.path.join(path_prefix, filing_index_path)

        # Check if this index is already tracked in the database.
        try:
            filing_index = FilingIndex.objects.get(edgar_url=filing_index_path)
            is_processed = filing_index.is_processed
            logger.info("Index {0} already exists in DB.".format(filing_index_path))
        except FilingIndex.DoesNotExist:
            is_processed = False
            logger.info("Index {0} does not exist in DB.".format(filing_index_path))

        # Check if exists in storage; download from EDGAR and upload if missing.
        if not download_client.path_exists(file_path):
            # Download
            buffer, _ = openedgar.clients.edgar.get_buffer(filing_index_path)
            # Upload
            download_client.put_buffer(file_path, buffer)
            logger.info("Retrieved {0} and uploaded to S3.".format(filing_index_path))
            path_list.append((file_path, True, is_processed))
        else:
            logger.info("Index {0} already exists on S3.".format(filing_index_path))
            path_list.append((file_path, False, is_processed))

    # Return list of updates
    return path_list
def uploading_text_in_filing_documents(store_raw: bool = False, store_text: bool = True):
    """
    Re-parse every processed filing and upload each document's extracted
    text to local storage, flagging documents with no parsed counterpart.

    BUG FIX: the original signature used the literal values ``False``/``True``
    as annotations (``store_raw: False``), which are not types and provided no
    defaults; both parameters now have proper bool annotations and defaults.

    :param store_raw: currently unused here; kept for interface compatibility
    :param store_text: when True, upload each document's extracted text
    :return: None
    """
    client = LocalClient()
    processed_filings = Filing.objects.filter(is_processed=True)

    for filing in processed_filings:
        buffer_data = client.get_buffer(filing.s3_path)
        logger.info("parsing id# {0} s3_path: {1}".format(filing.id, filing.s3_path))
        filing_data = openedgar.parsers.edgar.parse_filing(buffer_data, extract=True)

        filing_documents = filing.filingdocument_set.all()
        logger.info("number of FilingDocument objects calculated: {0}".format(
            len(filing_documents)))
        documents_data = filing_data["documents"]
        logger.info("number of documents coming from data stream: {0}".format(
            len(documents_data)))

        # Iterate through documents, matching each DB record to its parsed
        # counterpart by sequence number.
        for document in filing_documents:
            # Renamed from `filing_data` in the original, which shadowed the
            # parsed-filing dict above.  Last match wins, as before.
            matched_data = None
            for d in documents_data:
                if int(d["sequence"]) == document.sequence:
                    matched_data = d

            if matched_data is not None:
                # Upload text to storage if requested and extraction succeeded.
                if store_text and matched_data["content_text"] is not None:
                    raw_path = pathlib.Path(DOCUMENT_PATH, "text", matched_data["sha1"]).as_posix()
                    if not client.path_exists(raw_path):
                        client.put_buffer(raw_path, matched_data["content_text"], write_bytes=False)
                        logger.info(
                            "Uploaded text contents for filing={0}, sequence={1}, sha1={2}"
                            .format(filing, matched_data["sequence"], matched_data["sha1"]))
                    else:
                        logger.info(
                            "Text contents for filing={0}, sequence={1}, sha1={2} already exists on S3"
                            .format(filing, matched_data["sequence"], matched_data["sha1"]))
            else:
                # No parsed counterpart for this sequence: flag as errored.
                document.is_processed = False
                document.is_error = True
                document.save()
def content(self):
    """
    Return this document's slice of its parent filing's raw buffer.

    Loads the complete filing from local storage, then extracts the
    [start_pos, end_pos) byte range belonging to this document.
    """
    storage = LocalClient()
    full_buffer = storage.get_buffer(self.filing.s3_path)
    start, end = self.start_pos, self.end_pos
    return full_buffer[start:end]
def get_buffer(self, filing_path):
    """
    Retrieve a filing from local storage and return its parsed contents.

    NOTE(review): despite the name, this returns the *parsed* filing data
    structure (via parse_filing), not the raw buffer — confirm callers
    expect that before renaming.

    :param filing_path: storage path of the filing to load
    :return: parsed filing data structure
    """
    # BUG FIX: the original log message said "from S3" even though a
    # LocalClient is always used here.
    logger.info("Retrieving buffer from local storage...")
    client = LocalClient()
    filing_buffer = client.get_buffer(filing_path)
    return openedgar.parsers.edgar.parse_filing(filing_buffer)
def process_filing_index(client_type: str, file_path: str,
                         filing_index_buffer: Union[str, bytes] = None,
                         form_type_list: Iterable[str] = None,
                         store_raw: bool = False, store_text: bool = False):
    """
    Process a filing index from an S3 path or buffer.

    Parses every record in the index, downloads/processes each filing that
    has no Filing record yet, then creates or updates the FilingIndex row.

    :param client_type: "S3" selects S3Client; anything else uses LocalClient
    :param file_path: S3 or local path to process; if filing_index_buffer is none, retrieved from here
    :param filing_index_buffer: buffer; if not present, s3_path must be set
    :param form_type_list: optional list of form type to process
    :param store_raw: passed through to process_filing
    :param store_text: passed through to process_filing
    :return: None
    """
    # Log entry
    logger.info("Processing filing index {0}...".format(file_path))

    # Select the storage backend by client_type.
    if client_type == "S3":
        client = S3Client()
    else:
        client = LocalClient()

    # Retrieve buffer if not passed
    if filing_index_buffer is None:
        logger.info("Retrieving filing index buffer for: {}...".format(file_path))
        filing_index_buffer = client.get_buffer(file_path)

    # Write to disk to handle headaches (the parser wants a file name).
    # NOTE(review): temp_file is only removed at the very end of this
    # function; any exception before then leaks the temp file on disk.
    temp_file = tempfile.NamedTemporaryFile(delete=False)
    temp_file.write(filing_index_buffer)
    temp_file.close()

    # Get main filing data structure (DataFrame-like: uses .shape/.iterrows).
    filing_index_data = openedgar.parsers.edgar.parse_index_file(temp_file.name)
    logger.info("Parsed {0} records from index".format(filing_index_data.shape[0]))

    # Iterate through rows
    bad_record_count = 0
    for _, row in filing_index_data.iterrows():
        # Check for form type whitelist
        if form_type_list is not None:
            if row["Form Type"] not in form_type_list:
                logger.info("Skipping filing {0} with form type {1}...".format(row["File Name"], row["Form Type"]))
                continue

        # Cleanup path
        # NOTE(review): if "File Name" starts with neither "data/" nor
        # "edgar/", filing_path is left unset (NameError on the first row,
        # or a stale value from the previous iteration) — confirm whether
        # index files can contain other prefixes.
        if row["File Name"].lower().startswith("data/"):
            filing_path = "edgar/{0}".format(row["File Name"])
        elif row["File Name"].lower().startswith("edgar/"):
            filing_path = row["File Name"]

        # Check if filing record exists
        try:
            filing = Filing.objects.get(s3_path=filing_path)
            logger.info("Filing record already exists: {0}".format(filing))
        except Filing.MultipleObjectsReturned as e:
            # Duplicate records: skip rather than guess which one to use.
            logger.error("Multiple Filing records found for s3_path={0}, skipping...".format(filing_path))
            logger.info("Raw exception: {0}".format(e))
            continue
        except Filing.DoesNotExist as f:
            # Create new filing record
            logger.info("No Filing record found for {0}, creating...".format(filing_path))
            logger.info("Raw exception: {0}".format(f))

            # Check if exists; download and upload to S3 if missing
            if not client.path_exists(filing_path):
                # Download
                try:
                    filing_buffer, _ = openedgar.clients.edgar.get_buffer("/Archives/{0}".format(filing_path))
                except RuntimeError as g:
                    # EDGAR fetch failed: record the error and move on.
                    logger.error("Unable to access resource {0} from EDGAR: {1}".format(filing_path, g))
                    bad_record_count += 1
                    create_filing_error(row, filing_path)
                    continue

                # Upload
                client.put_buffer(filing_path, filing_buffer)
                logger.info("Downloaded from EDGAR and uploaded to {}...".format(client_type))
            else:
                # Download from our own storage instead of EDGAR.
                logger.info("File already stored on {}, retrieving and processing...".format(client_type))
                filing_buffer = client.get_buffer(filing_path)

            # Parse; a None result means the filing could not be processed.
            filing_result = process_filing(client, filing_path, filing_buffer, store_raw=store_raw, store_text=store_text)
            if filing_result is None:
                logger.error("Unable to process filing.")
                bad_record_count += 1
                create_filing_error(row, filing_path)

    # Create (or update) the filing index record with processing stats.
    edgar_url = "/Archives/{0}".format(file_path).replace("//", "/")
    try:
        filing_index = FilingIndex.objects.get(edgar_url=edgar_url)
        filing_index.total_record_count = filing_index_data.shape[0]
        filing_index.bad_record_count = bad_record_count
        filing_index.is_processed = True
        filing_index.is_error = False
        filing_index.save()
        logger.info("Updated existing filing index record.")
    except FilingIndex.DoesNotExist:
        filing_index = FilingIndex()
        filing_index.edgar_url = edgar_url
        filing_index.date_published = None
        filing_index.date_downloaded = datetime.date.today()
        filing_index.total_record_count = filing_index_data.shape[0]
        filing_index.bad_record_count = bad_record_count
        filing_index.is_processed = True
        filing_index.is_error = False
        filing_index.save()
        logger.info("Created new filing index record.")

    # Delete file if we make it this far
    os.remove(temp_file.name)
def process_company_filings(client_type: str, cik: str, store_raw: bool = False, store_text: bool = False):
    """
    Process all 10-K filings for a single company identified by CIK.

    Walks the links returned by links_10k(cik), downloading and processing
    each filing that does not yet have a Filing record.

    :param client_type: "S3" selects S3Client; anything else uses LocalClient
    :param cik: company CIK to process
    :param store_raw: passed through to process_filing
    :param store_text: passed through to process_filing
    :return: None
    """
    # Log entry
    logger.info("Processing company cik {0}...".format(cik))

    # Get path to filings folder for cik
    # NOTE(review): cik_path is computed but never used below — confirm
    # whether it was meant to feed links_10k or can be removed.
    cik_path = openedgar.clients.edgar.get_cik_path(cik)
    links = links_10k(cik)

    # Select the storage backend by client_type.
    if client_type == "S3":
        client = S3Client()
    else:
        client = LocalClient()

    # Iterate through links
    bad_record_count = 0
    for row in links:
        # Cleanup path
        # NOTE(review): if a link starts with neither "data/" nor "edgar/",
        # filing_path is left unset (NameError or stale value from the
        # previous iteration) — confirm link format.
        if row.lower().startswith("data/"):
            filing_path = "edgar/{0}".format(row)
        elif row.lower().startswith("edgar/"):
            filing_path = row

        # Check if filing record exists
        try:
            filing = Filing.objects.get(s3_path=filing_path)
            logger.info("Filing record already exists: {0}".format(filing))
        except Filing.MultipleObjectsReturned as e:
            # Duplicate records: skip rather than guess which one to use.
            logger.error(
                "Multiple Filing records found for s3_path={0}, skipping...".
                format(filing_path))
            logger.info("Raw exception: {0}".format(e))
            continue
        except Filing.DoesNotExist as f:
            # Create new filing record
            logger.info("No Filing record found for {0}, creating...".format(
                filing_path))
            logger.info("Raw exception: {0}".format(f))

            # Check if exists; download and upload to S3 if missing
            if not client.path_exists(filing_path):
                # Download
                try:
                    filing_buffer, _ = openedgar.clients.edgar.get_buffer(
                        "/Archives/{0}".format(filing_path))
                except RuntimeError as g:
                    # EDGAR fetch failed: count the bad record and move on.
                    # NOTE(review): create_filing_error is commented out here
                    # but called in process_filing_index — confirm intent.
                    logger.error(
                        "Unable to access resource {0} from EDGAR: {1}".format(
                            filing_path, g))
                    bad_record_count += 1
                    #create_filing_error(row, filing_path)
                    continue

                # Upload
                client.put_buffer(filing_path, filing_buffer)
                logger.info(
                    "Downloaded from EDGAR and uploaded to {}...".format(
                        client_type))
            else:
                # Download from our own storage instead of EDGAR.
                logger.info(
                    "File already stored on {}, retrieving and processing...".
                    format(client_type))
                filing_buffer = client.get_buffer(filing_path)

            # Parse; a None result means the filing could not be processed.
            filing_result = process_filing(client, filing_path, filing_buffer, store_raw=store_raw, store_text=store_text)
            if filing_result is None:
                logger.error("Unable to process filing.")
                bad_record_count += 1
# ---------------------------------------------------------------------------
# Module-level logging configuration: INFO-level console handler on `logger`.
# ---------------------------------------------------------------------------
logger.setLevel(logging.INFO)
console = logging.StreamHandler()
console.setLevel(logging.INFO)
formatter = logging.Formatter('%(name)-12s: %(levelname)-8s %(message)s')
console.setFormatter(formatter)
logger.addHandler(console)

# Storage configuration.
# NOTE(review): CLIENT_TYPE is hard-coded to "LOCAL_CLIENT", so the S3 branch
# below is dead code — confirm whether this should read os.environ instead
# (download_filing_index_data reads CLIENT_TYPE from the environment).
CLIENT_TYPE = "LOCAL_CLIENT"
LOCAL_DOCUMENT_PATH = os.environ["DOWNLOAD_PATH"]
DOCUMENT_PATH = ""
if CLIENT_TYPE == "S3":
    client = S3Client()
    DOCUMENT_PATH = S3_DOCUMENT_PATH
else:
    client = LocalClient()
    DOCUMENT_PATH = LOCAL_DOCUMENT_PATH


# NOTE(review): this re-definition shadows the earlier process_company_filings
# in this module; the definition continues beyond this view and is reproduced
# verbatim here.
def process_company_filings(client_type: str, cik: str, store_raw: bool = False, store_text: bool = False):
    """
    Process a filing index from an S3 path or buffer.
    :param file_path: S3 or local path to process; if filing_index_buffer is none, retrieved from here
    :param filing_index_buffer: buffer; if not present, s3_path must be set
    :param form_type_list: optional list of form type to process
    :param store_raw:
    :param store_text:
    :return: