def parse_section_header(sec):
    """Collect the title, prompt and response of a section element.

    The title is the text of the first ``h3`` found under ``sec``. The
    prompt is the leading text of the first ``p`` and the response is the
    text of that paragraph's first child element. Every value is passed
    through ``cleantext`` before being stored.

    Each lookup is indexed with ``[0]``, so a section missing its ``h3``,
    its ``p``, or the paragraph's child element is not handled here and the
    resulting error propagates to the caller.

    Returns a dict with the keys ``title``, ``prompt`` and ``response``.
    """
    heading = sec.cssselect('h3')[0]
    title = cleantext(heading.text)

    paragraph = sec.cssselect('p')[0]
    # The response lives in the paragraph's first child element, while the
    # prompt is the paragraph's own text that precedes that child.
    answer = list(paragraph)[0]

    return {
        'title': title,
        'prompt': cleantext(paragraph.text),
        'response': cleantext(answer.text),
    }
def parsec_01(sec):
    """Parse a section into its header fields plus its table records.

    Reads the first ``h3`` for the title, the first ``p`` for the prompt and
    that paragraph's first child element for the response, then attaches
    whatever ``extract_table(sec)`` returns under ``records``.

    Returns a dict with the keys ``title``, ``prompt``, ``response`` and
    ``records``.
    """
    title = sec.cssselect('h3')[0].text
    paragraph = sec.cssselect('p')[0]
    answer = list(paragraph)[0]
    prompt = cleantext(paragraph.text)
    response = cleantext(answer.text)

    return {
        # NOTE(review): the title is stored as-is here, whereas
        # parse_section_header runs it through cleantext -- confirm whether
        # the difference is intentional.
        'title': title,
        'prompt': prompt,
        'response': response,
        'records': extract_table(sec),
    }
def table_records_default_parser(sec):
    """Turn the table inside a section into a list of row dicts.

    Looks up the table with ``sec.find('*/table')``. When there is none, an
    empty list is returned. Otherwise the children of the first ``.header``
    element supply the column names (slugified), and every ``tr`` after the
    first becomes a dict mapping column name to the cleaned text content of
    the cell in the same position.

    A row with more cells than there are column names is not handled here;
    the lookup of the missing column name raises.
    """
    table = sec.find('*/table')
    if table is None:
        return []

    header_row = table.cssselect('.header')[0]
    columns = [slugify(cell.text) for cell in header_row]
    # Sample header texts:
    # [None, 'Asset', 'Asset Type', 'Owner', 'Value', 'Income Type', 'Income']
    # The first column carries no label of its own, so it is named here.
    columns[0] = 'id'

    # The first <tr> is skipped; the remaining rows are the data rows.
    return [
        {
            columns[position]: cleantext(cell.text_content())
            for position, cell in enumerate(row)
        }
        for row in table.cssselect('tr')[1:]
    ]
# Example #4
# 0
def classify_doc_meta(title, url):
    """Derive document metadata from a document's title and URL.

    The title is first normalised with ``cleantext``. The returned dict has
    these keys:

    * ``doc_type`` -- result of ``_classify_doc_type(title, url)``.
    * ``doc_subtype`` -- result of ``_subclassify_fdreport(title)`` when the
      type is 'Financial Disclosure Report' or 'Extension Notice',
      otherwise ``None``.
    * ``amendment_number`` -- ``None`` unless the title contains
      'Amendment'; then the number following it, or 1 if no number is found.
    * ``extension_number`` -- ``None`` unless the type is
      'Extension Notice'; then the number following 'Extension' in the
      title, or 1 if no number is found.
    * ``doc_filetype`` -- 'paper' when the URL contains '/view/paper',
      otherwise 'html'.
    * ``doc_related_date`` -- result of ``_extract_related_date(title)``.
    """
    title = cleantext(title)

    # The file type depends only on the URL.
    filetype = 'paper' if '/view/paper' in url else 'html'

    # An amendment with no explicit number is treated as amendment 1.
    amendment_number = None
    if 'Amendment' in title:
        found = re.search(r'Amendment *(\d+)', title)
        amendment_number = int(found.group(1)) if found else 1

    doc_type = _classify_doc_type(title, url)

    # Only financial disclosure reports and extension notices get a subtype.
    subtype = None
    if doc_type in ('Financial Disclosure Report', 'Extension Notice'):
        subtype = _subclassify_fdreport(title)

    # Presumably an extension without a number is the first of its kind.
    extension_number = None
    if doc_type == 'Extension Notice':
        found = re.search(r'Extension *(\d+)', title)
        extension_number = int(found.group(1)) if found else 1

    return {
        'doc_type': doc_type,
        'doc_subtype': subtype,
        'amendment_number': amendment_number,
        'extension_number': extension_number,
        'doc_filetype': filetype,
        'doc_related_date': _extract_related_date(title),
    }
# Example #5
# 0
def parse_single_raw_record(record):
    """Convert one raw record into a flat dict of strings.

    Each entry of ``PARSED_HEADERS`` maps an output key to a callable; the
    callable is applied to ``record`` and its result cleaned with
    ``cleantext``. The metadata from ``classify_doc_meta`` (computed from
    the ``doc_title`` and ``doc_url`` fields) is merged on top.

    Finally every value is stringified. Any falsy value -- ``None``, an
    empty string, and also ``0`` -- becomes the empty string.
    """
    parsed = {}
    for key, extractor in PARSED_HEADERS.items():
        parsed[key] = cleantext(extractor(record))

    parsed.update(classify_doc_meta(parsed['doc_title'], parsed['doc_url']))

    return {
        key: (str(value) if value else '')
        for key, value in parsed.items()
    }