def _write(d):
    """Write the text of one loaded filing to its markdown path.

    :param d: dict with the keys ``'folder'``, ``'text'`` and ``'text_file'``.
        ``'folder'`` is appended to ``Filing.HTTP_ROOT`` to build the filing
        url; ``'text'`` is passed through ``clean`` unless ``'text_file'`` is
        truthy.
    """
    filing = Filing(url=Filing.HTTP_ROOT + d['folder'])
    path = filing.md_path()

    # A single pre-formatted argument prints the same line whether ``print``
    # is the Python 2 statement or the Python 3 function.
    print('Writing: %s' % (path,))

    text = d['text']
    if not d['text_file']:
        text = clean(text)

    with open(path, 'w') as f:
        f.write(text)
def create_filing(self, url):
    """Return the filing for ``url`` with its type and html loaded via the client.

    The ``sec_ftp`` client is created and logged in on first use and then
    cached on ``self.client``.

    :param url: filing url; ``Filing.HTTP_ROOT`` is removed from it to get the
        folder passed to ``self.client.load``
    :return: a ``Filing`` (looked up with ``Filing.get``, or newly constructed
        when the lookup raises) with ``type`` and ``html`` set from the
        loaded data
    """
    if not hasattr(self, 'client'):
        self.client = sec_ftp.Client()
        self.client.login()

    try:
        filing = Filing.get(url)
    except Exception:
        # Lookup failed: start from a fresh record. ``Exception`` rather than
        # a bare ``except`` so Ctrl-C / SystemExit still propagate.
        filing = Filing(url=url)

    folder = url.replace(Filing.HTTP_ROOT, '')
    d = self.client.load(folder)
    filing.type = d['type']
    filing.html = d['text']
    return filing
def write_corpus():
    """Load and write every corpus folder whose markdown file does not exist yet.

    Each folder in ``CORPUS_FOLDERS`` is mapped to its markdown path with
    ``Filing.folder_to_md_path``; folders whose path is missing on disk are
    loaded with ``_load`` and each resulting dictionary is passed to ``_write``.
    """
    not_written = [
        folder for folder in CORPUS_FOLDERS
        if not os.path.exists(Filing.folder_to_md_path(folder))
    ]
    # Explicit loop instead of ``map``: on Python 3 ``map`` is lazy, so the
    # writes would never run when it is used only for its side effects.
    for dictionary in _load(not_written):
        _write(dictionary)
def scrape_page(self, page=None, counter=0, saved_page=False):
    """
    Scrape html for filings data and add them to db.

    :param page: html of a search results page; when None (and no saved page
        was read) the page is requested with ``self.load_page()``
    :param counter: technical var to keep track of processed filings
    :param saved_page: boolean indicating whether to use a saved web page (for debugging)
    :return: number of processed filings
    """
    # Use the last saved page (if readable) for debugging purposes,
    # otherwise send a request for the starting page
    if saved_page:
        try:
            with open(self.lastpage_path, 'r') as input_file:
                page = input_file.read()
        except (OSError, UnicodeDecodeError):
            page = self.load_page()
    if page is None:
        # Called without html: the parser cannot handle None, so fetch the page
        page = self.load_page()

    soup = bs4.BeautifulSoup(page, features="html.parser")
    tables = soup.find_all("table", attrs={'xmlns:autn': "http://schemas.autonomy.com/aci/"})
    if not tables:
        print(f'{ats()} Something\'s wrong. No search results have been found!')
        sys.exit(1)

    # Iterate through rows of the search results table
    for tr in tables[0].select("tr"):
        # Identify top row of each search result (skip rows with classes blue and infoBorder).
        row_classes = tr.attrs.get('class', ())
        if any(css_class in row_classes for css_class in ('infoBorder', 'blue')):
            continue

        tds = tr.select("td")

        # Transform filing date in "mm/dd/yyyy" format to date object
        date_arr = tds[0].text.split("/")
        filing_date = date(year=int(date_arr[2]), month=int(date_arr[0]), day=int(date_arr[1]))

        title_links = tds[1].select('a')
        if len(title_links) > 1:
            filing_title = title_links[0].text
            filing_type = filing_title[:6].strip()  # should be either EX-102 or EX-103

            # Skip EX-103 exhibits
            if filing_type == 'EX-103':
                continue

            filing_url = title_links[1].attrs['href'].strip()
            url_parts = filing_url.split("/")
            acc_no = url_parts[7].strip()
            filer_cik = url_parts[6].strip()  # headline company cik
            filer_company = filing_title[21:].strip()  # company name as appears in the headline
            trust_company = filer_company
            trust_cik = filer_cik

            # Get more information on company
            # (attrs must be a dict; a set literal only works by accident as a class search)
            middle_tr = tr.find_next_sibling("tr", attrs={'class': 'blue'})
            company_strings = middle_tr.select(".normalbold")

            if len(company_strings) > 1:
                # If two company names below headline
                a1_text = company_strings[0].text
                a2_text = company_strings[1].text
                a1_company = a1_text.split("(")[0].strip()
                a2_company = a2_text.split("(")[0].strip()

                # The first run of 6 to 8 digits is taken as the cik
                a1_cik = re.findall(r'\d{6,8}', a1_text)[0]
                a2_cik = re.findall(r'\d{6,8}', a2_text)[0]

                # Trust's cik is always greater than filer's
                if int(a1_cik) > int(a2_cik):
                    trust_company = a1_company
                    trust_cik = a1_cik
                else:
                    trust_company = a2_company
                    trust_cik = a2_cik
            else:
                # If only one company's name below headline, dig deeper
                info_tr = tr.find_next_sibling("tr", attrs={'class': 'infoBorder'})
                parent_filing_href = info_tr.select("td.footer a.clsBlueBg")[0].attrs['href']

                # The url is the first comma-separated item between the first
                # "(" and ")" of the href, with surrounding single quotes removed
                args_start = parent_filing_href.find("(") + 1
                args_end = parent_filing_href.find(")")
                abs_url = parent_filing_href[args_start:args_end].split(",")[0].strip("'")

                absee_page = self.load_page(abs_url)
                (abs_cik, abs_trust) = self.parse_absee(absee_page)
                if abs_cik is not None:
                    trust_cik = abs_cik
                if abs_trust is not None:
                    trust_company = abs_trust

            # Save company and filing data into db
            with IndexDb.get_session() as session:
                asset_type = None

                # Save trust company if does not exist
                tco = session.query(Company).get(trust_cik)
                if tco is None:
                    # Asset type is read from the "absee/<type>/assetdata" part of the preview
                    preview = FileDownloader.preview_download(filing_url)
                    match = re.search(r'absee/(\w+)/assetdata', preview)
                    if match:
                        asset_type = match.group(1)
                    tco = Company(cik=trust_cik, name=trust_company, is_trust=True, asset_type=asset_type)
                    session.add(tco)

                # Save filer company if does not exist
                if filer_cik != trust_cik:
                    fco = session.query(Company).get(filer_cik)
                    if fco is None:
                        fco = Company(cik=filer_cik, name=filer_company, is_trust=False, asset_type=asset_type)
                        session.add(fco)

                # Save filing data
                filing = Filing(acc_no=acc_no, cik_filer=filer_cik, cik_trust=trust_cik, url=filing_url,
                                date_filing=filing_date)

                # Save only if no database entry found for this accession no
                if session.query(Filing).get(acc_no) is None:
                    session.add(filing)

            counter += 1
            print(f'{ats()} Done with {filer_company}-{filer_cik} from {filing_date}...')

    return counter