def parse_section_header(sec):
    """Collect the title, prompt and response text of a section element.

    The title comes from the first ``h3`` found under ``sec``; the prompt is
    the leading text of the first ``p``; the response is the text of that
    paragraph's first child element. Every value is passed through
    ``cleantext`` before being stored.

    An ``IndexError`` propagates when ``sec`` has no ``h3`` or ``p`` match,
    or when the paragraph has no child element.
    """
    title = cleantext(sec.cssselect('h3')[0].text)
    paragraph = sec.cssselect('p')[0]
    # The response lives in the first element nested inside the paragraph.
    answer = list(paragraph)[0]
    return {
        'title': title,
        'prompt': cleantext(paragraph.text),
        'response': cleantext(answer.text),
    }
def parsec_01(sec):
    """Parse a section into its header fields plus its table records.

    The header fields (``title``, ``prompt``, ``response``) are produced by
    ``parse_section_header`` rather than re-implemented here, so the two
    parsers cannot drift apart. As a consequence the title is normalized
    with ``cleantext`` exactly like the prompt and response. The value
    returned by ``extract_table(sec)`` is stored under ``records``.

    Exceptions raised by ``parse_section_header`` (e.g. ``IndexError`` when
    the expected ``h3``/``p`` elements are missing) propagate unchanged.
    """
    data = parse_section_header(sec)
    data['records'] = extract_table(sec)
    return data
def table_records_default_parser(sec):
    """Turn the table located by ``sec.find('*/table')`` into a list of dicts.

    Column names are taken from the children of the first ``.header``
    element, passed through ``slugify``; the first column is always renamed
    to ``'id'``. Every ``tr`` after the first yields one dict mapping column
    name to the ``cleantext``-ed text content of the cell at that position.

    Returns an empty list when no table is found. An ``IndexError``
    propagates if the table has no ``.header`` element, the header has no
    cells, or a row holds more cells than there are headers.
    """
    table = sec.find('*/table')
    if table is None:
        return []

    header_row = table.cssselect('.header')[0]
    # Example header texts noted by the original author:
    # [None, 'Asset', 'Asset Type', 'Owner', 'Value', 'Income Type', 'Income']
    headers = [slugify(cell.text) for cell in header_row]
    headers[0] = 'id'

    body_rows = table.cssselect('tr')[1:]  # skip the first (header) row
    return [
        {
            headers[pos]: cleantext(cell.text_content())
            for pos, cell in enumerate(row)
        }
        for row in body_rows
    ]
def classify_doc_meta(title, url):
    """Derive document metadata from a document's title and URL.

    Returns a dict with the keys ``doc_type``, ``doc_subtype``,
    ``amendment_number``, ``extension_number``, ``doc_filetype`` and
    ``doc_related_date``. Fields that do not apply stay ``None``.

    * ``doc_filetype`` is ``'paper'`` when the URL contains ``/view/paper``,
      otherwise ``'html'``.
    * ``amendment_number`` is set only when the cleaned title contains
      ``'Amendment'``; it defaults to 1 when no number follows the word.
    * ``doc_subtype`` is filled only for financial disclosure reports and
      extension notices.
    * ``extension_number`` is set only for extension notices and likewise
      defaults to 1 when the title carries no explicit number.
    """
    clean_title = cleantext(title)
    meta = dict.fromkeys((
        'doc_type',
        'doc_subtype',
        'amendment_number',
        'extension_number',
        'doc_filetype',
    ))

    # File type is decided purely by the URL path.
    meta['doc_filetype'] = 'paper' if '/view/paper' in url else 'html'

    # Amendments: pull out the explicit number if the title carries one.
    if 'Amendment' in clean_title:
        amend_match = re.search(r'Amendment *(\d+)', clean_title)
        if amend_match:
            meta['amendment_number'] = int(amend_match.group(1))
        else:
            meta['amendment_number'] = 1

    doc_type = _classify_doc_type(clean_title, url)
    meta['doc_type'] = doc_type

    # Only financial disclosure reports and extensions have subtypes.
    if doc_type in ('Financial Disclosure Report', 'Extension Notice'):
        meta['doc_subtype'] = _subclassify_fdreport(clean_title)

    if doc_type == 'Extension Notice':
        ext_match = re.search(r'Extension *(\d+)', clean_title)
        # Presumably an extension without a number is the first of its kind.
        if ext_match:
            meta['extension_number'] = int(ext_match.group(1))
        else:
            meta['extension_number'] = 1

    # NOTE(review): assumed to apply to every document type, not only
    # extension notices - confirm against the intended nesting.
    meta['doc_related_date'] = _extract_related_date(clean_title)
    return meta
def parse_single_raw_record(record):
    """Convert one raw record into a flat dict of string values.

    Each entry of ``PARSED_HEADERS`` maps an output key to a callable; the
    callable is applied to ``record`` and its result passed through
    ``cleantext``. The metadata returned by ``classify_doc_meta`` for the
    record's ``doc_title`` and ``doc_url`` is then merged in. Finally every
    value is converted with ``str``, with any falsy value (``None``, ``''``,
    ``0``, ...) becoming the empty string.
    """
    parsed = {}
    for key, extract in PARSED_HEADERS.items():
        parsed[key] = cleantext(extract(record))

    doc_meta = classify_doc_meta(parsed['doc_title'], parsed['doc_url'])
    parsed.update(doc_meta)

    # Normalize everything to strings; falsy values collapse to ''.
    return {key: (str(val) if val else '') for key, val in parsed.items()}