def parse(CIK, filing_year, company_name=None, raw_website_data=None,
          processed_website_data=None, get_legal_proceeding_only=False,
          get_litigation_footnotes_only=False):
    '''
    Parse a company's 10-K filing into a ParsingResults object.

    Keyword arguments let the caller short-circuit the pipeline: if
    processed_website_data is given, no download or HTML cleaning happens;
    if raw_website_data is given, only the HTML cleaning step runs;
    otherwise the filing is downloaded from EDGAR and the raw bytes are
    cached to disk via CorpusAccess.

    CIK -- SEC Central Index Key identifying the company
    filing_year -- year of the 10-K filing
    company_name -- optional; looked up from the CIK when None
    raw_website_data -- optional pre-downloaded raw HTML
    processed_website_data -- optional already-cleaned filing text
    get_legal_proceeding_only / get_litigation_footnotes_only -- flags
        forwarded to the litigation-mention extraction step

    Returns the populated ParsingResults.
    Raises Exception when no 10-K URL can be found for the CIK/year.
    '''
    results = ParsingResults(CIK, filing_year, company_name,
                             processed_text=processed_website_data)
    if results.company_name is None:
        results.company_name = edgar.get_name_of_company_from_cik(results.CIK)
    if results.processed_text is None:
        if raw_website_data is None:
            url = edgar.get_10k_url(CIK=results.CIK,
                                    filing_year=results.filing_year)
            if url is not None:
                # Bug fix: use the same download timeout as _get_raw_data so a
                # hung EDGAR connection cannot block parsing indefinitely.
                response = urllib2.urlopen(
                    url, timeout=Constants.URL_DOWNLOAD_TIMEOUT_IN_SECS).read()
                # Cache the raw bytes so later runs can skip the download.
                CorpusAccess.write_raw_url_data_to_file(
                    response, results.CIK, results.filing_year)
            else:
                raise Exception("Error: No URL to parse for data.")
        else:
            response = raw_website_data
        results.processed_text = convert_html_into_clean_text(response)
    # NOTE(review): "litigaton" is a typo, but the helper is defined elsewhere
    # in this file under that name -- rename both together, not here.
    _get_litigaton_mentions(results, get_legal_proceeding_only,
                            get_litigation_footnotes_only)
    return results
def _get_raw_data(CIK, year):
    '''
    Process-safe access to a 10-K's raw HTML, indexed by CIK and filing year.

    Checks the on-disk corpus first; on a miss, downloads the filing from
    EDGAR and stores it to disk for future calls.

    Returns the raw data, or None when the corpus has nothing and no URL
    exists for this CIK/year.
    '''
    # Maintain an exclusive zone while acquiring raw data: without the lock,
    # OS scheduling could easily lead to multiple concurrent download
    # attempts (and disk writes) of the same filing.
    _corpus_access_mutex.acquire()
    try:
        raw_data = CorpusAccess.get_raw_website_data_from_corpus(
            CIK=CIK, filing_year=year)
        if raw_data is None:
            url = edgar.get_10k_url(CIK=CIK, filing_year=year)
            if url is not None:
                raw_data = urllib2.urlopen(
                    url, timeout=Constants.URL_DOWNLOAD_TIMEOUT_IN_SECS).read()
                CorpusAccess.write_raw_url_data_to_file(
                    raw_data, CIK=CIK, filing_year=year)
    finally:
        # Bug fix: release in a finally block -- previously an exception
        # during the download (e.g. the configured socket timeout firing)
        # left the mutex held forever, deadlocking every later caller.
        _corpus_access_mutex.release()
    return raw_data