Exemplo n.º 1
Arquivo: cli.py Projeto: iangow/hal
def _write(d):
    """Write one loaded filing's text to its markdown path.

    :param d: dict with the keys ``'folder'`` (appended to
        ``Filing.HTTP_ROOT`` to form the filing URL), ``'text'`` (the filing
        body) and ``'text_file'`` (when falsy, the text is passed through
        ``clean`` before being written).
    """
    filing = Filing(
        url=Filing.HTTP_ROOT + d['folder']
    )
    path = filing.md_path()
    # A single pre-formatted argument prints the same under Python 2 and 3.
    print('Writing: %s' % path)
    text = d['text']
    if not d['text_file']:
        text = clean(text)
    with open(path, 'w') as f:
        # NOTE(review): the body of this `with` was missing from the reviewed
        # source; writing `text` is the evident intent -- confirm against the
        # original file.
        f.write(text)
Exemplo n.º 2
Arquivo: cli.py Projeto: iangow/hal
    def create_filing(self, url):
        """Build a ``Filing`` for *url* and fill it from the loaded folder data.

        :param url: filing URL; the part after ``Filing.HTTP_ROOT`` is used
            as the folder passed to ``self.client.load``.
        :return: the ``Filing`` with ``type`` and ``html`` set from the
            loaded dict's ``'type'`` and ``'text'`` entries.
        """
        # Create the client lazily, once per instance.
        if not hasattr(self, 'client'):
            self.client = sec_ftp.Client()

        # `filing` must be bound on every call: when these statements lived
        # inside the `if` above, any call made after the client already
        # existed failed with NameError at `filing.type = ...`.
        # NOTE(review): the lookup result is overwritten immediately, which
        # suggests a lost `try:` / `except <not-found error>:` pair around
        # these two lines. Restore it once the exception type raised by
        # `Filing.get` is confirmed.
        filing = Filing.get(url)
        filing = Filing(url=url)
        folder = url.replace(Filing.HTTP_ROOT, '')

        d = self.client.load(folder)

        filing.type = d['type']
        filing.html = d['text']

        return filing
Exemplo n.º 3
Arquivo: cli.py Projeto: iangow/hal
def write_corpus():
    """Load and write every corpus folder whose markdown file does not exist.

    Each folder in ``CORPUS_FOLDERS`` is mapped to its markdown path; the
    folders whose path is missing on disk are loaded with ``_load`` and each
    resulting dict is handed to ``_write``.
    """
    paths = [Filing.folder_to_md_path(folder) for folder in CORPUS_FOLDERS]
    # NOTE(review): the element expression of this comprehension was missing
    # from the reviewed source. Selecting the folder that corresponds to each
    # missing path is the reconstruction -- confirm that `_load` expects
    # folders.
    not_written = [
        folder
        for folder, p in zip(CORPUS_FOLDERS, paths)
        if not os.path.exists(p)
    ]
    dictionaries = _load(not_written)
    # Explicit loop: `map` is lazy on Python 3, so `map(_write, ...)` on its
    # own would never call `_write`.
    for d in dictionaries:
        _write(d)
Exemplo n.º 4
    def scrape_page(self, page=None, counter=0, saved_page=False):
        # NOTE(review): this method is not valid Python as it stands. Several
        # lines appear to have been lost (docstring delimiters, branch bodies,
        # `else:` clauses, the tail of two expressions). Each gap is marked
        # below; restore the lines from the original file rather than guessing.
        #
        # NOTE(review): the five lines that follow read like the docstring, but
        # the surrounding triple quotes are missing.
        Scrape html for filings data and add them to db.
        :param page: html
        :param counter: technical var to keep track of processed filings
        :param saved_page: boolean indicating whether to use a saved web page (for debugging)
        :return: number of processed filings
        # Retrieved last saved page (if available) for debugging purposes or send a request for starting page
        # NOTE(review): the extra indentation of the next three statements
        # implies enclosing lines (presumably `try:` / `except ...:`) are
        # missing; as shown, the saved page would always be overwritten by
        # `self.load_page()`.
        if saved_page:
                with open(self.lastpage_path, 'r') as input_file:
                    page = input_file.read()
                page = self.load_page()

        soup = bs4.BeautifulSoup(page, features="html.parser")
        tables = soup.find_all("table", attrs={'xmlns:autn': "http://schemas.autonomy.com/aci/"})
        # NOTE(review): only a message is printed when no table is found;
        # `tables[0]` below would then raise IndexError. An early exit was
        # presumably here -- confirm.
        if len(tables) == 0:
            print(f'{ats()} Something\'s wrong. No search results have been found!')

        # Iterate through rows of the search results table
        for tr in tables[0].select("tr"):
            # Identify top row of each search result (skip rows with classes blue and infoBorder).
            # NOTE(review): the body of this `if` is missing (presumably `continue`).
            if 'class' in tr.attrs and any(css_class in tr.attrs['class'] for css_class in ['infoBorder', 'blue']):

            # First cell holds the filing date, second cell the title links.
            tds = tr.select("td")
            # Transform filing date in "mm/dd/yyyy" format to date object
            date_str = tds[0].text
            date_arr = date_str.split("/")
            filing_date = date(year=int(date_arr[2]), month=int(date_arr[0]), day=int(date_arr[1]))

            title_links = tds[1].select('a')
            # Rows with fewer than two links in the title cell are not processed.
            if len(title_links) > 1:
                filing_title = title_links[0].text
                filing_type = filing_title[:6].strip()  # should be either EX-102 or EX-103
                # Skip EX-103 exhibits
                # NOTE(review): the body of this `if` is missing (presumably `continue`).
                if filing_type == 'EX-103':

                # Accession number and filer CIK are taken from fixed positions
                # (path segments 7 and 6) of the second link's URL.
                filing_url = title_links[1].attrs['href'].strip()
                acc_no = filing_url.split("/")[7].strip()
                filer_cik = filing_url.split("/")[6].strip()  # headline company cik
                filer_company = filing_title[21:].strip()  # company name as appears in the headline
                # Default: treat the filer as the trust until better data is found below.
                trust_company = filer_company
                trust_cik = filer_cik
                # Get more information on company
                # NOTE(review): `{'class', 'blue'}` is a set literal, not a dict;
                # `{'class': 'blue'}` was presumably intended -- confirm against
                # the Beautiful Soup documentation for `attrs`.
                middle_tr = tr.find_next_sibling("tr", attrs={'class', 'blue'})
                company_strings = middle_tr.select(".normalbold")

                if len(company_strings) > 1:
                    # If two company names below headline
                    a1_company = company_strings[0].text.split("(")[0].strip()
                    a2_company = company_strings[1].text.split("(")[0].strip()
                    # The pattern takes the first run of 6 to 8 digits as the CIK
                    a1_cik = re.findall(r'\d{6,8}', company_strings[0].text)[0]
                    a2_cik = re.findall(r'\d{6,8}', company_strings[1].text)[0]
                    # Trust's cik is always greater than filer's
                    if int(a1_cik) > int(a2_cik):
                        trust_company = a1_company
                        trust_cik = a1_cik
                        # NOTE(review): no `else:` is visible here, so as shown the
                        # next two assignments always overwrite the pair above.
                        trust_company = a2_company
                        trust_cik = a2_cik
                    # NOTE(review): an `else:` for the single-company case appears
                    # to be missing before the following branch.
                    # If only one company's name below headline, dig deeper
                    parent_filing_href = tr.find_next_sibling("tr", attrs={'class', 'infoBorder'})\
                        .select("td.footer a.clsBlueBg")[0].attrs['href']
                    # The URL is the text between the first "(" and the first ")" of the href.
                    # NOTE(review): the next line ends with a line-continuation
                    # backslash but is followed by a new statement; the continued
                    # part of the expression is missing.
                    abs_url = parent_filing_href[parent_filing_href.find("(")+1:parent_filing_href.find(")")]\
                    absee_page = self.load_page(abs_url)
                    (abs_cik, abs_trust) = self.parse_absee(absee_page)
                    # Only override the defaults with values that were actually found.
                    if abs_cik is not None:
                        trust_cik = abs_cik
                    if abs_trust is not None:
                        trust_company = abs_trust

                # Save company and filing data into db
                with IndexDb.get_session() as session:
                    asset_type = None
                    # Save trust company if does not exist
                    tco = session.query(Company).get(trust_cik)
                    if tco is None:
                        # The asset type is read from an "absee/<type>/assetdata"
                        # fragment in the download preview, when present.
                        preview = FileDownloader.preview_download(filing_url)
                        match = re.search(r'absee/(\w+)/assetdata', preview)
                        if match:
                            asset_type = match.group(1)
                        tco = Company(cik=trust_cik, name=trust_company, is_trust=True, asset_type=asset_type)
                        # NOTE(review): no call that adds `tco` to the session is
                        # visible -- confirm against the original file.
                    # Save filer company if does not exist
                    if not filer_cik == trust_cik:
                        fco = session.query(Company).get(filer_cik)
                        if fco is None:
                            fco = Company(cik=filer_cik, name=filer_company, is_trust=False, asset_type=asset_type)
                            # NOTE(review): likewise, `fco` is never added to the
                            # session in the visible code.
                    # Save filing data
                    # NOTE(review): the `Filing(` call below is never closed; its
                    # remaining arguments and the closing parenthesis are missing.
                    filing = Filing(acc_no=acc_no,
                    # Save only if no database entry found for this accession no
                    # NOTE(review): the body of this `if` is missing.
                    if session.query(Filing).get(acc_no) is None:

                    # Count the processed filing and log progress.
                    counter += 1
                    print(f'{ats()} Done with {filer_company}-{filer_cik} from {filing_date}...')

        # Number of filings processed, including the initial `counter` value.
        return counter