def scrape_bill(self, chamber, session, bill_id, bill_type):
    url = '%s?r=%s' % (self.base_url, bill_id)

    with self.urlopen(url) as html:
        doc = lxml.html.fromstring(html)

        # search for Titulo, accent over i messes up lxml, so use 'tulo'
        title = doc.xpath(u'//td/b[contains(text(),"tulo")]/../following-sibling::td/text()')
        if not title:
            raise NoSuchBill()
        bill = Bill(session, chamber, bill_id, title[0], type=bill_type)

        author = doc.xpath(u'//td/b[contains(text(),"Autor")]/../text()')[0]
        bill.add_sponsor('primary', author.strip())

        action_table = doc.xpath('//table')[-1]
        for row in action_table[1:]:
            tds = row.xpath('td')

            # ignore row missing date
            if len(tds) != 2:
                continue

            date = datetime.datetime.strptime(tds[0].text_content(), "%m/%d/%Y")
            action = tds[1].text_content()
            bill.add_action(chamber, action, date)

            # also has an associated version
            if tds[1].xpath('a'):
                bill.add_version(action, tds[1].xpath('a/@href')[0])

        bill.add_source(url)
        self.save_bill(bill)
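# NoSuchBill, raised above when the page has no title, is not defined in this
# listing; a minimal sketch, assuming it is just a scraper-level exception:
class NoSuchBill(Exception):
    """Raised when the requested bill does not exist on the source site."""
    pass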
def scrape_bill(self, chamber, session): url = "ftp://www.arkleg.state.ar.us/dfadooas/LegislativeMeasures.txt" page = self.urlopen(url).decode('latin-1') page = unicode_csv_reader(StringIO.StringIO(page), delimiter='|') for row in page: bill_chamber = {'H': 'lower', 'S': 'upper'}[row[0]] if bill_chamber != chamber: continue bill_id = "%s%s %s" % (row[0], row[1], row[2]) type_spec = re.match(r'(H|S)([A-Z]+)\s', bill_id).group(2) bill_type = { 'B': 'bill', 'R': 'resolution', 'JR': 'joint resolution', 'CR': 'concurrent resolution', 'MR': 'memorial resolution', 'CMR': 'concurrent memorial resolution'}[type_spec] bill = Bill('2011', chamber, bill_id, row[3], type=bill_type) bill.add_source(url) bill.add_sponsor('lead sponsor', row[11]) version_url = ("ftp://www.arkleg.state.ar.us/Bills/" "%s/Public/%s.pdf" % ( session, bill_id.replace(' ', ''))) bill.add_version(bill_id, version_url) self.scrape_votes(bill) self.bills[bill_id] = bill
def scrape2003(self, url, year, chamberName, session, number):
    "e.g. http://www.legis.ga.gov/legis/2003_04/sum/sum/sb1.htm"
    with self.lxml_context(url) as page:
        # Grab the interesting tables on the page.
        tables = page.cssselect('center table')

        # Bill
        name = tables[0].text_content().split('-', 1)[1]
        bill = Bill(session, chamberName, number, name)

        # Sponsorships
        for a in tables[1].cssselect('a'):
            bill.add_sponsor('', a.text_content().strip())

        # Actions
        center = page.cssselect('center table center')[0]
        for row in center.cssselect('table')[-2].cssselect('tr')[2:]:
            date = row[0].text_content().strip()
            action_text = row[1].text_content().strip()
            if '/' not in date:
                continue
            if action_text.startswith('Senate'):
                bill.add_action('upper', action_text, date)
            elif action_text.startswith('House'):
                bill.add_action('lower', action_text, date)

        # Versions (iterate the links themselves, not rows)
        for a in center.cssselect('table')[-1].cssselect('a'):
            bill.add_version(a.text_content(),
                             urlparse.urljoin(url, a.get('href')))

        self.save_bill(bill)
def scrape2001(self, url, year, chamberName, session, number):
    "e.g. http://www.legis.ga.gov/legis/2001_02/sum/sb1.htm"
    with self.lxml_context(url) as page:
        # Grab the interesting tables on the page.
        tables = page.cssselect("table center table")

        # Bill
        name = tables[0].text_content().split("-", 1)[1]
        bill = Bill(session, chamberName, number, name)

        # Sponsorships
        for a in tables[1].cssselect("a"):
            bill.add_sponsor("", a.text_content().strip())

        # Actions
        center = page.cssselect("table center")[-1]
        for row in center.cssselect("table table")[0].cssselect("tr")[2:]:
            date = row[0].text_content().strip()
            action_text = row[1].text_content().strip()
            if "/" not in date:
                continue
            if action_text.startswith("Senate"):
                action_text = action_text.split(" ", 1)[1].strip()
                bill.add_action("upper", action_text, date)
            elif action_text.startswith("House"):
                action_text = action_text.split(" ", 1)[1].strip()
                bill.add_action("lower", action_text, date)

        # Versions (iterate the links themselves, not rows)
        for a in center.cssselect("table table")[1].cssselect("a"):
            bill.add_version(a.text_content(),
                             urlparse.urljoin(url, a.get("href")))

        self.save_bill(bill)
def scrape1999(self, url, year, chamberName, session, number):
    "e.g. http://www.legis.ga.gov/legis/1999_00/leg/sum/sb1.htm"
    # bind the parsed document as `page` (the body never uses the name `lxml`)
    with self.lxml_context(url) as page:
        # Grab the interesting tables on the page.
        tables = page.cssselect("table")

        # Bill
        name = tables[1].cssselect("a")[0].text_content().split("-", 1)[1]
        bill = Bill(session, chamberName, number, name)

        # Versions
        bill.add_version("Current", url.replace("/sum/", "/fulltext/"))

        # Sponsorships
        for a in tables[2].cssselect("a"):
            bill.add_sponsor("", a.text_content().strip())

        # Actions
        for row in tables[-1].cssselect("tr"):
            senate_date = row[0].text_content().strip()
            action_text = row[1].text_content().strip()
            house_date = row[2].text_content().strip()
            if "/" not in senate_date and "/" not in house_date:
                continue
            if senate_date:
                bill.add_action("upper", action_text, senate_date)
            if house_date:
                bill.add_action("lower", action_text, house_date)

        self.save_bill(bill)
def scrape_bill(self, chamber, session):
    url = "ftp://www.arkleg.state.ar.us/dfadooas/LegislativeMeasures.txt"
    page = self.urlopen(url).decode("latin-1")
    page = unicode_csv_reader(StringIO.StringIO(page), delimiter="|")

    for row in page:
        bill_chamber = {"H": "lower", "S": "upper"}[row[0]]
        if bill_chamber != chamber:
            continue

        bill_id = "%s%s %s" % (row[0], row[1], row[2])

        type_spec = re.match(r"(H|S)([A-Z]+)\s", bill_id).group(2)
        bill_type = {
            "B": "bill",
            "R": "resolution",
            "JR": "joint resolution",
            "CR": "concurrent resolution",
            "MR": "memorial resolution",
            "CMR": "concurrent memorial resolution",
        }[type_spec]

        if row[-1] != self.slug:
            continue

        bill = Bill(session, chamber, bill_id, row[3], type=bill_type)
        bill.add_source(url)
        bill.add_sponsor("lead sponsor", row[11])

        version_url = ("ftp://www.arkleg.state.ar.us/Bills/"
                       "%s/Public/%s.pdf" % (session,
                                             bill_id.replace(" ", "")))
        bill.add_version(bill_id, version_url)

        self.scrape_bill_page(bill)
        self.bills[bill_id] = bill
def scrape_bill(self, session, chamber, bill_id, url):
    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        title = page.xpath("//br")[8].tail
        if not title:
            return
        title = title.strip()

        bill = Bill(session, chamber, bill_id, title)
        bill.add_source(url)

        action_link = page.xpath("//a[contains(@href, 'getActions')]")[0]
        self.scrape_actions(bill, action_link.attrib['href'])

        version_path = "//a[contains(., '%s')]"
        for version_type in ('Introduced Bill', 'House Bill', 'Senate Bill',
                             'Engrossed Bill', 'Enrolled Act'):
            path = version_path % version_type
            links = page.xpath(path)
            if links:
                bill.add_version(version_type, links[0].attrib['href'])

        for doc_link in page.xpath("//a[contains(@href, 'FISCAL')]"):
            num = doc_link.text.strip().split("(")[0]
            bill.add_document("Fiscal Impact Statement #%s" % num,
                              doc_link.attrib['href'])

        bill['subjects'] = self.subjects[bill_id]
        self.save_bill(bill)
def scrape_bill(self, chamber, session, bill_id, url):
    try:
        page = lxml.html.fromstring(self.urlopen(url))
    except scrapelib.HTTPError as e:
        self.warning("error (%s) fetching %s, skipping" % (e, url))
        return

    title = page.xpath(
        "string(//span[contains(@id, 'PlaceHolder1_txtST')])").strip()

    if "JR" in bill_id:
        bill_type = ["joint resolution"]
    elif "CR" in bill_id:
        bill_type = ["concurrent resolution"]
    elif "R" in bill_id:
        bill_type = ["resolution"]
    else:
        bill_type = ["bill"]

    bill = Bill(session, chamber, bill_id, title, type=bill_type)
    bill.add_source(url)
    bill["subjects"] = self.subject_map[bill_id]

    for link in page.xpath("//a[contains(@id, 'Auth')]"):
        name = link.xpath("string()").strip()
        if "otherAuth" in link.attrib["id"]:
            bill.add_sponsor("coauthor", name)
        else:
            bill.add_sponsor("author", name)

    act_table = page.xpath("//table[contains(@id, 'Actions')]")[0]
    for tr in act_table.xpath("tr")[2:]:
        action = tr.xpath("string(td[1])").strip()
        if not action or action == "None":
            continue

        date = tr.xpath("string(td[3])").strip()
        date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

        actor = tr.xpath("string(td[4])").strip()
        if actor == "H":
            actor = "lower"
        elif actor == "S":
            actor = "upper"

        bill.add_action(actor, action, date, type=action_type(action))

    version_table = page.xpath("//table[contains(@id, 'Versions')]")[0]
    for link in version_table.xpath(".//a[contains(@href, '.DOC')]"):
        version_url = link.attrib["href"]
        if "COMMITTEE REPORTS" in version_url:
            continue
        name = link.text.strip()
        bill.add_version(name, version_url)

    for link in page.xpath(".//a[contains(@href, '_VOTES')]"):
        self.scrape_votes(bill, urlescape(link.attrib["href"]))

    self.save_bill(bill)
def scrape2009(self, url, year, chamberName, session, number):
    "e.g. http://www.legis.ga.gov/legis/2009_10/sum/sum/sb1.htm"
    with self.lxml_context(url) as page:
        # Bill
        name = page.cssselect('#legislation h1')[0].text_content().strip()
        bill_id = name.split(' - ')[0].strip()
        bill = Bill(session, chamberName, bill_id, name)

        # Sponsorships
        for a in page.cssselect("#sponsors a"):
            bill.add_sponsor('', a.text_content().strip())

        # Actions
        for row in page.cssselect('#history tr')[1:]:
            date = row[0].text_content().strip()
            action_text = row[1].text_content().strip()
            if '/' not in date:
                continue
            date = datetime.datetime.strptime(date, '%m/%d/%Y')
            if action_text.startswith('Senate'):
                bill.add_action('upper', action_text, date)
            elif action_text.startswith('House'):
                bill.add_action('lower', action_text, date)

        # Versions (iterate the links themselves, not rows)
        for a in page.cssselect('#versions a'):
            bill.add_version(a.text_content(),
                             urlparse.urljoin(url, a.get('href')))

        self.save_bill(bill)
def scrape(self, session, chambers):
    url = 'http://www.ontla.on.ca/web/bills/bills_all.do?locale=en&parlSessionID=%s' % session
    html = self.urlopen(url)
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)

    for row in doc.xpath('//table/tr'):
        id, title_td, sponsor = row.xpath('td')
        bill_id = id.text_content().strip()
        title = clean_spaces(title_td.text_content())

        # pull sponsor off different page
        bill = Bill(session, 'lower', bill_id, title)

        # skip to detail page
        detail_url = title_td.xpath('a/@href')[0] + "&detailPage=bills_detail_status"
        bill.add_source(url)
        bill.add_source(detail_url)

        # get actions & sponsors
        self.scrape_details(bill, detail_url)

        if not bill['versions']:
            self.warning('no versions detected via normal method, using '
                         'top-level page')
            bill.add_version('Original (current version)',
                             title_td.xpath('a/@href')[0],
                             mimetype='text/html')

        self.save_bill(bill)
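# clean_spaces() is used above but not defined in this listing; a minimal
# sketch, assuming it collapses runs of whitespace (including non-breaking
# spaces) into single spaces and trims the ends:
import re

def clean_spaces(text):
    """Collapse internal whitespace and strip leading/trailing space."""
    return re.sub(r'\s+', ' ', text, flags=re.UNICODE).strip()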
def scrape_bill(self, chamber, session, bill_id):
    biennium = "%s-%s" % (session[0:4], session[7:9])
    bill_num = bill_id.split()[1]

    url = ("%s/GetLegislation?biennium=%s&billNumber"
           "=%s" % (self._base_url, biennium, bill_num))

    with self.urlopen(url) as page:
        page = lxml.etree.fromstring(page).xpath("//wa:Legislation",
                                                 namespaces=self._ns)[0]

        title = page.xpath("string(wa:LongDescription)",
                           namespaces=self._ns)

        bill_type = page.xpath(
            "string(wa:ShortLegislationType/wa:LongLegislationType)",
            namespaces=self._ns).lower()

        if bill_type == 'gubernatorial appointment':
            return

        bill = Bill(session, chamber, bill_id, title, type=[bill_type])

        chamber_name = {'lower': 'House', 'upper': 'Senate'}[chamber]
        version_url = ("http://www.leg.wa.gov/pub/billinfo/2011-12/"
                       "Htm/Bills/%s %ss/%s.htm" % (chamber_name,
                                                    bill_type.title(),
                                                    bill_num))
        bill.add_version(bill_id, version_url)

        self.scrape_sponsors(bill)
        self.scrape_actions(bill)

        self.save_bill(bill)
def _parse_bill(self, session, chamber, source_url, line):
    if line:
        (type, combined_id, number, title, relating_to) = line.split("\xe4")
        if (type == 'HB' and chamber == 'lower') or \
                (type == 'SB' and chamber == 'upper'):
            # basic bill info
            bill_id = "%s %s" % (type, number.zfill(4))
            bill = Bill(session, chamber, bill_id, title)
            bill.add_source(source_url)

            # add actions
            if self.actionsByBill.has_key(bill_id):
                for a in self.actionsByBill[bill_id]:
                    bill.add_action(a['actor'], a['action'], a['date'])

            if self.load_versions_sponsors:
                # add versions and sponsors
                versionsSponsors = self.versionsSponsorsParser.fetch_and_parse(
                    self, session, bill_id)
                #print "versionsSponsors: %s" % str(versionsSponsors)
                if versionsSponsors:
                    for ver in versionsSponsors['versions']:
                        bill.add_version(ver['name'], ver['url'])

                    sponsorType = 'primary'
                    if len(versionsSponsors['sponsors']) > 1:
                        sponsorType = 'cosponsor'
                    for name in versionsSponsors['sponsors']:
                        bill.add_sponsor(sponsorType, name)

            # save - writes out JSON
            self.save_bill(bill)
def process_bill(self, data):
    chamber = parse_psuedo_id(data['from_organization'])['classification']
    if chamber == 'legislature':
        chamber = 'upper'

    bill = Bill(data['legislative_session'], chamber, data['identifier'],
                data['title'], subjects=data['subject'],
                type=data['classification'])
    if data['abstracts']:
        bill['summary'] = data['abstracts'][0]['abstract']
    bill.update(**data['extras'])

    for action in data['actions']:
        actor = parse_psuedo_id(action['organization_id'])['classification']
        legislators = []
        committees = []
        for rel in action['related_entities']:
            if rel['entity_type'] == 'organization':
                committees.append(rel['name'])
            elif rel['entity_type'] == 'person':
                legislators.append(rel['name'])
        bill.add_action(actor,
                        action['description'],
                        parse_date(action['date']),
                        type=_action_categories(action['classification']),
                        committees=committees,
                        legislators=legislators,
                        **action.get('extras', {}))

    for source in data['sources']:
        bill.add_source(source['url'])

    for sponsor in data['sponsorships']:
        bill.add_sponsor(sponsor['classification'], sponsor['name'])

    for version in data['versions']:
        for link in version['links']:
            bill.add_version(version['note'], link['url'],
                             mimetype=link['media_type'],
                             date=parse_date(version['date']),
                             **version.get('extras', {}))

    for doc in data['documents']:
        for link in doc['links']:
            bill.add_document(doc['note'], link['url'],
                              mimetype=link['media_type'],
                              date=parse_date(doc['date']),
                              **doc.get('extras', {}))

    for title in data['other_titles']:
        bill.add_title(title['title'])

    for related in data['related_bills']:
        bill.add_companion(related['identifier'],
                           related['legislative_session'],
                           chamber)

    self.save_bill(bill)
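# parse_psuedo_id() is assumed to unpack a pupa-style pseudo-id such as
# '~{"classification": "lower"}' into a dict; a minimal sketch of that
# assumption (the real helper may differ):
import json

def parse_psuedo_id(psuedo_id):
    """Decode a '~{...json...}' organization pseudo-id into a dict."""
    assert psuedo_id.startswith('~'), 'expected a pseudo-id starting with ~'
    return json.loads(psuedo_id[1:])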
def scrape1999(self, url, year, chamberName, session, number):
    "e.g. http://www.legis.ga.gov/legis/1999_00/leg/sum/sb1.htm"
    # bind the parsed document as `page` (the body never uses the name `lxml`)
    with self.lxml_context(url) as page:
        # Grab the interesting tables on the page.
        tables = page.cssselect('table')

        # Bill
        name = tables[1].cssselect('a')[0].text_content().split('-', 1)[1]
        bill = Bill(session, chamberName, number, name)

        # Versions
        bill.add_version('Current', url.replace('/sum/', '/fulltext/'))

        # Sponsorships
        for a in tables[2].cssselect('a'):
            bill.add_sponsor('', a.text_content().strip())

        # Actions
        for row in tables[-1].cssselect('tr'):
            senate_date = row[0].text_content().strip()
            action_text = row[1].text_content().strip()
            house_date = row[2].text_content().strip()
            if '/' not in senate_date and '/' not in house_date:
                continue
            if senate_date:
                bill.add_action('upper', action_text, senate_date)
            if house_date:
                bill.add_action('lower', action_text, house_date)

        self.save_bill(bill)
def scrape(self, chamber, session):
    chamber_abbrev = {'upper': 'SF', 'lower': 'HB'}[chamber]

    url = ("http://legisweb.state.wy.us/%s/billindex/"
           "BillCrossRef.aspx?type=%s" % (session, chamber_abbrev))
    page = lxml.html.fromstring(self.urlopen(url))

    for tr in page.xpath("//tr[@valign='middle']")[1:]:
        bill_id = tr.xpath("string(td[1])").strip()
        title = tr.xpath("string(td[2])").strip()

        if bill_id[0:2] in ['SJ', 'HJ']:
            bill_type = 'joint resolution'
        else:
            bill_type = 'bill'

        bill = Bill(session, chamber, bill_id, title, type=bill_type)

        self.scrape_digest(bill)

        # versions
        for a in (tr.xpath('td[6]//a') + tr.xpath('td[9]//a') +
                  tr.xpath('td[10]//a')):
            bill.add_version(a.text, a.get('href'))

        # documents
        fnote = tr.xpath('td[7]//a')
        if fnote:
            bill.add_document('Fiscal Note', fnote[0].get('href'))
        summary = tr.xpath('td[12]//a')
        if summary:
            bill.add_document('Summary', summary[0].get('href'))

        bill.add_source(url)
        self.save_bill(bill)
def scrape_assem_bills(self, chamber, insert, session, year):
    doc_type = {1: 'bill', 3: 'resolution', 5: 'concurrent resolution',
                6: 'joint resolution'}
    for docnum, bill_type in doc_type.iteritems():
        parentpage_url = 'http://www.leg.state.nv.us/Session/%s/Reports/HistListBills.cfm?DoctypeID=%s' % (insert, docnum)
        links = self.scrape_links(parentpage_url)
        count = 0
        for link in links:
            count = count + 1
            page_path = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (insert, link)
            with self.urlopen(page_path) as page:
                page = page.decode("utf8").replace(u"\xa0", " ")
                root = lxml.html.fromstring(page)

                bill_id = root.xpath('string(/html/body/div[@id="content"]/table[1]/tr[1]/td[1]/font)')
                title = root.xpath('string(/html/body/div[@id="content"]/table[1]/tr[5]/td)')

                if insert.find('Special') != -1:
                    session = insert

                bill = Bill(session, chamber, bill_id, title, type=bill_type)

                bill_text = root.xpath("string(/html/body/div[@id='content']/table[6]/tr/td[2]/a/@href)")
                text_url = "http://www.leg.state.nv.us" + bill_text
                bill.add_version("Bill Text", text_url)

                primary, secondary = self.scrape_sponsors(page)

                if primary[0] == 'By:':
                    primary.pop(0)

                    if primary[0] == 'ElectionsProceduresEthicsand':
                        primary[0] = 'Elections Procedures Ethics and'

                    full_name = ''
                    for part_name in primary:
                        full_name = full_name + part_name + " "
                    bill.add_sponsor('primary', full_name)
                else:
                    for leg in primary:
                        bill.add_sponsor('primary', leg)
                for leg in secondary:
                    bill.add_sponsor('cosponsor', leg)

                minutes_count = 2
                for mr in root.xpath('//table[4]/tr/td[3]/a'):
                    minutes = mr.xpath("string(@href)")
                    minutes_url = "http://www.leg.state.nv.us" + minutes
                    minutes_date_path = "string(//table[4]/tr[%s]/td[2])" % minutes_count
                    minutes_date = mr.xpath(minutes_date_path).split()
                    minutes_date = minutes_date[0] + minutes_date[1] + minutes_date[2] + " Minutes"
                    bill.add_document(minutes_date, minutes_url)
                    minutes_count = minutes_count + 1

                self.scrape_actions(root, bill, "lower")
                self.scrape_votes(page, bill, insert, year)
                bill.add_source(page_path)
                self.save_bill(bill)
def scrape(self, chamber, session):
    self.validate_session(session)

    if chamber == 'upper':
        other_chamber = 'lower'
        bill_id = 'SB 1'
    else:
        other_chamber = 'upper'
        bill_id = 'HB 1'

    b1 = Bill(session, chamber, bill_id, 'A super bill')
    b1.add_source('http://example.com/')
    b1.add_version('As Introduced', 'http://example.com/SB1.html')
    b1.add_document('Google', 'http://google.com')
    b1.add_sponsor('primary', 'Bob Smith')
    b1.add_sponsor('secondary', 'Johnson, Sally')

    d1 = datetime.datetime.strptime('1/29/2010', '%m/%d/%Y')
    v1 = Vote('upper', d1, 'Final passage', True, 2, 0, 0)
    v1.yes('Smith')
    v1.yes('Johnson')

    d2 = datetime.datetime.strptime('1/30/2010', '%m/%d/%Y')
    v2 = Vote('lower', d2, 'Final passage', False, 0, 1, 1)
    v2.no('Bob Smith')
    v2.other('S. Johnson')

    b1.add_vote(v1)
    b1.add_vote(v2)

    b1.add_action(chamber, 'introduced', d1)
    b1.add_action(chamber, 'read first time', d2)
    b1.add_action(other_chamber, 'introduced', d2)

    self.save_bill(b1)
def scrape(self, session, chambers):
    # Get the progress table.
    url = 'http://www.assembly.nl.ca/business/bills/ga47session1.htm'
    doc = lxml.html.fromstring(self.urlopen(url))
    doc.make_links_absolute(url)

    for tr in doc.xpath('//table[@class="bills"]/tr')[1:]:
        bill_id = clean_spaces(tr[0].text_content()).strip('*')
        if not bill_id:
            break  # empty rows extend past actual list of bills
        if bill_id.endswith("."):
            bill_id = bill_id[:-1]

        title = clean_spaces(tr[1].text_content())
        chapter = tr[-1].text_content()

        bill = Bill(session, 'lower', bill_id, title, type='bill')
        if chapter:
            bill['chapter'] = chapter

        # FIXME need to do more work to figure out what
        # version the text *really* is
        td = tr[1]
        bill_url = td.xpath('a/@href')
        if bill_url:
            bill.add_version(url=bill_url.pop(), name='First Reading',
                             mimetype='text/html')

        # Actions and version urls.
        data = zip(['First Reading', 'Second Reading', 'Committee',
                    'Amendments', 'Third Reading', 'Royal Assent', 'Act'],
                   tr[2:-1])
        for action, td in data:
            date_text = td.text_content()
            date = None
            fmt = r'%b. %d/%Y'
            # skip cells whose text doesn't parse as a date
            try:
                date = datetime.datetime.strptime(date_text, fmt)
            except ValueError:
                continue
            if date is None:
                continue
            attrs = dict(action=action, date=date, actor='lower')
            attrs.update(self.categorizer.categorize(action))
            bill.add_action(**attrs)

        bill.add_source(url)
        self.save_bill(bill)
def get_bill_info(self, chamber, session, bill_detail_url, version_list_url):
    """Extracts all the requested info for a given bill.

    Calls the parent's methods to enter the results into JSON files.
    """
    if chamber == "House":
        chamber = 'lower'
    else:
        chamber = 'upper'

    with self.urlopen(bill_detail_url) as bill_html:
        doc = lxml.html.fromstring(bill_html)

        bill_id = doc.xpath('//title/text()')[0].split()[0]
        bill_title = doc.xpath('//font[@size=-1]/text()')[0]
        bill_type = {'F': 'bill', 'R': 'resolution',
                     'C': 'concurrent resolution'}[bill_id[1]]
        bill = Bill(session, chamber, bill_id, bill_title, type=bill_type)
        bill['subjects'] = self._subject_mapping[bill_id]
        bill.add_source(bill_detail_url)

        # grab sponsors
        sponsors = doc.xpath('//table[@summary="Show Authors"]/descendant::a/text()')
        if sponsors:
            primary_sponsor = sponsors[0].strip()
            bill.add_sponsor('primary', primary_sponsor, chamber=chamber)
            cosponsors = sponsors[1:]
            for leg in cosponsors:
                bill.add_sponsor('cosponsor', leg.strip(), chamber=chamber)

        # Add Actions performed on the bill.
        bill_actions = self.extract_bill_actions(doc, chamber)
        for action in bill_actions:
            kwargs = {}
            if 'committee' in action:
                kwargs['committees'] = action['committees']

            bill.add_action(action['action_chamber'],
                            action['action_text'],
                            action['action_date'],
                            type=action['action_type'],
                            **kwargs)

    # Get all versions of the bill.
    # Versions of a bill are on a separate page, linked to from the column
    # labeled, "Bill Text", on the search results page.
    with self.urlopen(version_list_url) as version_html:
        if 'resolution' in version_html.response.url:
            bill.add_version('resolution text', version_html.response.url,
                             mimetype='text/html')
        else:
            version_doc = lxml.html.fromstring(version_html)
            for v in version_doc.xpath('//a[starts-with(@href, "/bin/getbill.php")]'):
                # mimetype belongs on add_version, not on urljoin
                version_url = urlparse.urljoin(VERSION_URL_BASE, v.get('href'))
                bill.add_version(v.text.strip(), version_url,
                                 mimetype='text/html')

    self.save_bill(bill)
def scrape_xml(self, chamber, session):
    start_letter = "S" if chamber == "upper" else "H"
    sponsor_type_dict = {"3": "senate cosponsor", "4": "sponsor", "5": "sponsor"}
    version_url = "http://www1.legis.ga.gov/legis/%s/versions/" % session

    summary_url = "http://www1.legis.ga.gov/legis/%s/list/BillSummary.xml" % session
    xml = self.urlopen(summary_url)
    doc = lxml.etree.fromstring(xml)

    for bxml in doc.xpath("//Bill"):
        type = bxml.get("Type")

        # if this is from the other chamber skip it
        if not type.startswith(start_letter):
            continue

        bill_id = type + bxml.get("Num") + bxml.get("Suffix")
        if type in ("HB", "SB"):
            type = "bill"
        elif type in ("HR", "SR"):
            type = "resolution"
        else:
            raise ValueError("unknown type: %s" % type)

        # use short_title as title and long as description
        title = bxml.xpath("Short_Title/text()")[0]
        description = bxml.xpath("Title/text()")[0]

        bill = Bill(session, chamber, bill_id, title, type=type,
                    description=description)
        bill.add_source(summary_url)

        for sponsor in bxml.xpath("Sponsor"):
            sponsor_name, code = sponsor.text.rsplit(" ", 1)
            sponsor_name = sponsor_name.replace(",", ", ")
            bill.add_sponsor(sponsor_type_dict[sponsor.get("Type")],
                             sponsor_name, _code=code)

        for version in bxml.xpath("Versions/Version"):
            # NOTE: it is possible to get PDF versions by using .get('Id')
            # ex. URL: legis.ga.gov/Legislation/20112012/108025.pdf
            # for now we just get HTML
            description, file_id = version.xpath("*/text()")
            bill.add_version(description, version_url + file_id)

        for action in bxml.xpath("StatusHistory/Status"):
            date = datetime.datetime.strptime(action.get("StatusDate"),
                                              "%Y-%m-%dT%H:%M:%S")
            code = action.get("StatusCode")
            if code in ("EFF", "Signed Gov"):
                actor = "executive"
            elif code[0] == "S":
                actor = "upper"
            elif code[0] == "H":
                actor = "lower"

            atype = self._action_codes[code]
            bill.add_action(actor, action.text, date, atype)

        self.save_bill(bill)
def scrape_bill(self, chamber, session, bill_id):
    biennium = "%s-%s" % (session[0:4], session[7:9])
    bill_num = bill_id.split()[1]

    url = ("%s/GetLegislation?biennium=%s&billNumber"
           "=%s" % (self._base_url, biennium, bill_num))

    page = self.urlopen(url)
    page = lxml.etree.fromstring(page.bytes)
    page = xpath(page, "//wa:Legislation")[0]

    title = xpath(page, "string(wa:LongDescription)")

    bill_type = xpath(
        page,
        "string(wa:ShortLegislationType/wa:LongLegislationType)")
    bill_type = bill_type.lower()

    if bill_type == 'gubernatorial appointment':
        return

    bill = Bill(session, chamber, bill_id, title, type=[bill_type])

    fake_source = ("http://apps.leg.wa.gov/billinfo/"
                   "summary.aspx?bill=%s&year=%s" % (bill_num, session[0:4]))
    bill.add_source(fake_source)

    chamber_name = {'lower': 'House', 'upper': 'Senate'}[chamber]
    mimetype = 'text/html'
    version_url = ("http://www.leg.wa.gov/pub/billinfo/%s/"
                   "Htm/Bills/%s %ss/%s.htm" % (biennium, chamber_name,
                                                bill_type.title(), bill_num))

    # Sometimes the measure's version_url isn't guessable. When that happens
    # have to get the url from the source page.
    try:
        version_resp = self.get(version_url)
        if version_resp.status_code != 200:
            webpage = self.get(fake_source).text
            webdoc = lxml.html.fromstring(webpage)
            version_url = webdoc.xpath('//a[contains(@href, "billdocs")]/@href')[-1]
            if version_url.lower().endswith('.pdf'):
                mimetype = 'application/pdf'
    except scrapelib.HTTPError:
        pass

    bill.add_version(bill_id, version_url, mimetype=mimetype)

    self.scrape_sponsors(bill)
    self.scrape_actions(bill, bill_num)
    self.scrape_votes(bill)
    self.fix_prefiled_action_dates(bill)

    return bill
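# The xpath() helper used in the Washington scrapers above is assumed to be a
# thin wrapper that binds the legislature's SOAP namespace to the 'wa' prefix;
# a minimal sketch (namespace URI is an assumption):
def xpath(elem, path):
    """Run an XPath query with the Washington legislature namespace as 'wa'."""
    return elem.xpath(path,
                      namespaces={'wa': 'http://WSLWebServices.leg.wa.gov/'})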
def scrape_bill(self, chamber, session, bill_id, url):
    page = lxml.html.fromstring(self.urlopen(url))

    title = page.xpath(
        "string(//span[contains(@id, 'PlaceHolder1_txtST')])").strip()

    if 'JR' in bill_id:
        bill_type = ['joint resolution']
    elif 'CR' in bill_id:
        bill_type = ['concurrent resolution']
    elif 'R' in bill_id:
        bill_type = ['resolution']
    else:
        bill_type = ['bill']

    bill = Bill(session, chamber, bill_id, title, type=bill_type)
    bill.add_source(url)
    bill['subjects'] = self.subject_map[bill_id]

    for link in page.xpath("//a[contains(@id, 'Auth')]"):
        name = link.xpath("string()").strip()
        if 'otherAuth' in link.attrib['id']:
            bill.add_sponsor('coauthor', name)
        else:
            bill.add_sponsor('author', name)

    act_table = page.xpath("//table[contains(@id, 'Actions')]")[0]
    for tr in act_table.xpath("tr")[2:]:
        action = tr.xpath("string(td[1])").strip()
        if not action or action == 'None':
            continue

        date = tr.xpath("string(td[3])").strip()
        date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

        actor = tr.xpath("string(td[4])").strip()
        if actor == 'H':
            actor = 'lower'
        elif actor == 'S':
            actor = 'upper'

        bill.add_action(actor, action, date, type=action_type(action))

    version_table = page.xpath("//table[contains(@id, 'Versions')]")[0]
    for link in version_table.xpath(".//a[contains(@href, '.DOC')]"):
        version_url = link.attrib['href']
        if 'COMMITTEE REPORTS' in version_url:
            continue
        name = link.text.strip()
        bill.add_version(name, version_url)

    for link in page.xpath(".//a[contains(@href, '_VOTES')]"):
        self.scrape_votes(bill, urlescape(link.attrib['href']))

    self.save_bill(bill)
def scrape_bill(self, session, chamber, bill_id, short_title, url):
    if bill_id in ['SCR 0003', 'SB 0251', 'SB 0292']:
        return

    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        # check for Bill Withdrawn header
        h1text = page.xpath('//h1/text()')
        if h1text and h1text[0] == 'Bill Withdrawn':
            return

        title = page.xpath("//br")[8].tail
        if not title:
            title = short_title
        title = title.strip()

        abbrev = bill_id.split()[0]
        if abbrev.endswith('B'):
            bill_type = ['bill']
        elif abbrev.endswith('JR'):
            bill_type = ['joint resolution']
        elif abbrev.endswith('CR'):
            bill_type = ['concurrent resolution']
        elif abbrev.endswith('R'):
            bill_type = ['resolution']

        bill = Bill(session, chamber, bill_id, title, type=bill_type)
        bill.add_source(url)

        action_link = page.xpath("//a[contains(@href, 'getActions')]")[0]
        self.scrape_actions(bill, action_link.attrib['href'])

        version_path = "//a[contains(., '%s')]"
        for version_type in ('Introduced Bill', 'House Bill', 'Senate Bill',
                             'Engrossed Bill', 'Enrolled Act'):
            path = version_path % version_type
            links = page.xpath(path)
            if links:
                bill.add_version(version_type, links[0].attrib['href'])

        for vote_link in page.xpath("//a[contains(@href, 'Srollcal')]"):
            self.scrape_senate_vote(bill, vote_link.attrib['href'])

        for vote_link in page.xpath("//a[contains(@href, 'Hrollcal')]"):
            self.scrape_house_vote(bill, vote_link.attrib['href'])

        for doc_link in page.xpath("//a[contains(@href, 'FISCAL')]"):
            num = doc_link.text.strip().split("(")[0]
            bill.add_document("Fiscal Impact Statement #%s" % num,
                              doc_link.attrib['href'])

        bill['subjects'] = self.subjects[bill_id]
        self.save_bill(bill)
def scrape_bill(self, session, bill_id, chamber):
    # https://malegislature.gov/Bills/189/SD2739
    session_for_url = self.replace_non_digits(session)
    bill_url = u'https://malegislature.gov/Bills/{}/{}'.format(session_for_url, bill_id)
    print bill_url

    try:
        response = requests.get(bill_url)
    except requests.exceptions.RequestException as e:
        self.warning(u'Server Error on {}'.format(bill_url))
        return False

    html = response.text
    page = lxml.html.fromstring(html)

    if page.xpath('//div[contains(@class, "followable")]/h1/text()'):
        bill_number = page.xpath('//div[contains(@class, "followable")]/h1/text()')[0]
    else:
        self.warning(u'Server Error on {}'.format(bill_url))
        return False

    bill_title = page.xpath('//div[@id="contentContainer"]/div/div/h2/text()')[0]

    bill_summary = ''
    if page.xpath('//p[@id="pinslip"]/text()'):
        bill_summary = page.xpath('//p[@id="pinslip"]/text()')[0]

    bill_id = re.sub(r'[^S|H|\d]', '', bill_id)

    bill = Bill(session, chamber, bill_id, bill_title, summary=bill_summary)
    bill.add_source(bill_url)

    # https://malegislature.gov/Bills/189/SD2739 has a presenter
    # https://malegislature.gov/Bills/189/S2168 no sponsor
    # Find the non-blank text of the dt following Sponsor or Presenter,
    # including any child link text.
    sponsor = page.xpath(
        '//dt[text()="Sponsor:" or text()="Presenter:"]/'
        'following-sibling::dd/descendant-or-self::*/text()[normalize-space()]')
    if sponsor:
        sponsor = sponsor[0].strip()
        bill.add_sponsor('primary', sponsor)

    has_cosponsor = page.xpath('//a[starts-with(normalize-space(.),"Petitioners")]')
    if has_cosponsor:
        self.scrape_cosponsors(bill, bill_url)

    version = page.xpath("//div[contains(@class, 'modalBtnGroup')]/"
                         "a[contains(text(), 'Download PDF') and not(@disabled)]/@href")
    if version:
        version_url = "https://malegislature.gov{}".format(version[0])
        bill.add_version('Bill Text', version_url, mimetype='application/pdf')

    self.scrape_actions(bill, bill_url)
    self.save_bill(bill)
def scrape_bill_page(self, chamber, session, bill_url, bill_type):
    page = self.lxmlize(bill_url)
    author = self.get_one_xpath(page,
                                "//a[@id='ctl00_PageBody_LinkAuthor']/text()")

    sbp = lambda x: self.scrape_bare_page(
        page.xpath("//a[contains(text(), '%s')]" % (x))[0].attrib["href"])

    authors = [x.text for x in sbp("Authors")]

    try:
        digests = sbp("Digests")
    except IndexError:
        digests = []

    try:
        versions = sbp("Text")
    except IndexError:
        versions = []

    title = page.xpath("//span[@id='ctl00_PageBody_LabelShortTitle']/text()")[0]
    actions = page.xpath("//div[@id='ctl00_PageBody_PanelBillInfo']/"
                         "/table[@style='font-size:small']/tr")

    bill_id = page.xpath("//span[@id='ctl00_PageBody_LabelBillID']/text()")[0]

    bill_type = {"B": "bill", "CR": "concurrent resolution"}[bill_type[1:]]
    bill = Bill(session, chamber, bill_id, title, type=bill_type)
    bill.add_source(bill_url)

    authors.remove(author)
    bill.add_sponsor("primary", author)
    for author in authors:
        bill.add_sponsor("cosponsor", author)

    for digest in digests:
        bill.add_document(digest.text, digest.attrib["href"],
                          mimetype="application/pdf")

    for version in versions:
        bill.add_version(version.text, version.attrib["href"],
                         mimetype="application/pdf")

    flags = {
        "prefiled": ["bill:filed"],
        "referred to the committee": ["committee:referred"],
    }

    for action in actions:
        date, chamber, page, text = [x.text for x in action.xpath(".//td")]
        # Session is April --> June. Prefiles
        # look like they're in January at earliest.
        date += "/%s" % (session)
        date = dt.datetime.strptime(date, "%m/%d/%Y")
        chamber = {"S": "upper", "H": "lower", "J": "joint"}[chamber]

        cat = []
        for flag in flags:
            if flag in text.lower():
                cat += flags[flag]

        if cat == []:
            cat = ["other"]

        bill.add_action(chamber, text, date, cat)

    self.save_bill(bill)
def scrape(self, chamber, session):
    self.log(self.metadata['session_details'])
    self.site_id = self.metadata['session_details'][session]['internal_id']
    chamber_piece = {'upper': 'Senate',
                     'lower': 'House+of+Representatives'}[chamber]

    # resolutions
    # http://alisondb.legislature.state.al.us/acas/SESSResosBySelectedMatterTransResults.asp?WhichResos=Senate&TransCodes={All}&LegDay={All}%22&GetBillsTrans=Get+Resolutions+by+Transaction

    url = 'http://alisondb.legislature.state.al.us/acas/SESSBillsBySelectedMatterTransResults.asp?TransCodes={All}&LegDay={All}&WhichBills=%s' % chamber_piece

    cookie = self.refresh_session()

    agent = FakeFirefoxURLopener()
    agent.addheader('Cookie', cookie)
    page = agent.open(url)
    doc = lxml.html.fromstring(page.read())

    # bills are all their own table with cellspacing=4 (skip first)
    bill_tables = doc.xpath('//table[@cellspacing="4"]')
    for bt in bill_tables[1:]:

        # each table has 3 rows: detail row, description, blank
        details, desc, _ = bt.xpath('tr')

        # first <tr> has img, button, sponsor, topic, current house,
        # current status, committee, committee2, last action
        _, button, sponsor, topic, _, _, com1, com2, _ = details.xpath('td')

        # pull bill_id out of script tag (gross)
        bill_id = bill_id_re.search(button.text_content()).group()
        self.log(bill_id)
        oid = btn_re.search(button.text_content()).groups()[0]

        sponsor = sponsor.text_content()
        topic = topic.text_content()
        com1 = com1.text_content()
        com2 = com2.text_content()
        desc = desc.text_content()

        # create bill
        bill = Bill(session, chamber, bill_id, desc.strip(), topic=topic)
        # add_sponsor takes the sponsorship type first, then the name
        bill.add_sponsor('primary', sponsor)

        self.get_sponsors(bill, oid)
        self.get_actions(bill, oid)

        # craft bill URL
        session_fragment = '2010rs'
        type_fragment = 'bills'
        bill_id_fragment = bill_id.lower()
        bill_text_url = 'http://alisondb.legislature.state.al.us/acas/searchableinstruments/%s/%s/%s.htm' % (
            session_fragment, type_fragment, bill_id_fragment)
        bill.add_version('bill text', bill_text_url)

        self.save_bill(bill)
def scrape_bills(self, chamber, session, subjects):
    idex = bill_start_numbers(session)[chamber]
    FROM = "ctl00$rilinContent$txtBillFrom"
    TO = "ctl00$rilinContent$txtBillTo"
    YEAR = "ctl00$rilinContent$cbYear"
    blocks = "FOO"  # Ugh.

    while len(blocks) > 0:
        default_headers = get_default_headers(SEARCH_URL)

        default_headers[FROM] = idex
        default_headers[TO] = idex + MAXQUERY
        default_headers[YEAR] = session
        idex += MAXQUERY
        #headers = urllib.urlencode( default_headers )

        blocks = self.parse_results_page(
            self.post(SEARCH_URL, data=default_headers).text)
        blocks = blocks[1:-1]
        blocks = self.digest_results_page(blocks)

        for block in blocks:
            bill = blocks[block]
            subs = []
            try:
                subs = subjects[bill['bill_id']]
            except KeyError:
                pass

            title = bill['title'][len("ENTITLED, "):]
            billid = bill['bill_id']
            try:
                subs = subjects[bill['bill_id']]
            except KeyError:
                subs = []

            for b in BILL_NAME_TRANSLATIONS:
                if billid[:len(b)] == b:
                    billid = BILL_NAME_TRANSLATIONS[b] + \
                        billid[len(b) + 1:].split()[0]

            b = Bill(session, chamber, billid, title,
                     type=self.get_type_by_name(bill['bill_id']),
                     subjects=subs)

            self.process_actions(bill['actions'], b)

            sponsors = bill['sponsors'][len("BY"):].strip()
            sponsors = sponsors.split(",")
            sponsors = [s.strip() for s in sponsors]

            for href in bill['bill_id_hrefs']:
                b.add_version(href.text, href.attrib['href'],
                              mimetype="application/pdf")

            for sponsor in sponsors:
                b.add_sponsor("primary", sponsor)

            b.add_source(SEARCH_URL)
            self.save_bill(b)
def scrape_current(self, chamber, term):
    chamber_name = "Senate" if chamber == "upper" else "House"
    # perhaps we should save this data so we can make one request for both chambers?
    with self.urlopen(ksapi.url + "bill_status/") as bill_request:
        bill_request_json = json.loads(bill_request)
        bills = bill_request_json["content"]

        for bill_data in bills:
            # filter out bills from the other chamber
            bill_is_in_chamber = False
            for history in bill_data["HISTORY"]:
                if history["chamber"] == chamber_name:
                    bill_is_in_chamber = True
            if not bill_is_in_chamber:
                continue

            # main
            bill = Bill(term, chamber, bill_data["BILLNO"], bill_data["SHORTTITLE"])
            bill.add_source(ksapi.url + "bill_status/" + bill_data["BILLNO"].lower())

            if bill_data["LONGTITLE"]:
                bill.add_title(bill_data["LONGTITLE"])

            bill.add_document("apn", ksapi.ksleg + bill_data["apn"])
            bill.add_version("Latest", ksapi.ksleg + bill_data["apn"])

            for sponsor in bill_data["SPONSOR_NAMES"]:
                stype = ("primary" if len(bill_data["SPONSOR_NAMES"]) == 1
                         else "cosponsor")
                bill.add_sponsor(stype, sponsor)

            for event in bill_data["HISTORY"]:
                if "committee_names" in event and "conferee_names" in event:
                    actor = " and ".join(event["committee_names"] +
                                         event["conferee_names"])
                elif "committee_names" in event:
                    actor = " and ".join(event["committee_names"])
                elif "conferee_names" in event:
                    actor = " and ".join(event["conferee_names"])
                else:
                    actor = "upper" if chamber_name == "Senate" else "lower"

                date = datetime.datetime.strptime(event["occurred_datetime"],
                                                  "%Y-%m-%dT%H:%M:%S")
                bill.add_action(actor, event["status"], date)

                if event["action_code"] in ksapi.voted:
                    votes = votes_re.match(event["status"])
                    if votes:
                        vote = Vote(chamber, date, votes.group(1),
                                    event["action_code"] in ksapi.passed,
                                    int(votes.group(2)),
                                    int(votes.group(3)),
                                    0)
                        vote.add_source(ksapi.ksleg + "bill_status/" +
                                        bill_data["BILLNO"].lower())
                        bill.add_vote(vote)

            self.save_bill(bill)
def scrape_bill(self, chamber, term, bill_id, url, title, subject=None):
    self.logger.info('GET ' + url)
    resp = self.get(url)
    html = resp.text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)

    bill = Bill(term, chamber, bill_id, title)
    bill.add_source(url)
    if subject is not None:
        bill['subjects'] = [subject]

    # Sponsors
    sponsor_map = {
        'author': 'primary',
        'co-author': 'cosponsor',
        'sponsor': 'cosponsor',
        'co-sponsor': 'cosponsor',
    }
    for div in doc.xpath('//div[contains(@class, "bill-author-info")]'):
        name = div.xpath('string(b)').strip()
        sp_type = sponsor_map[div.xpath('string(p)').strip().lower()]
        bill.add_sponsor(sp_type, name)

    # Actions
    for li in doc.xpath('//div[@id="bill-actions"]//li')[::-1]:
        if li.text_content() == 'None currently available.':
            continue

        chamber_str = li.xpath('string(strong)').strip()
        action_chamber = dict(H='lower', S='upper')[chamber_str]

        action_date = li.xpath('string(span[@class="document-date"])')
        # Some resolution actions have no dates.
        if not action_date.strip():
            continue
        action_date = datetime.datetime.strptime(action_date.strip(),
                                                 '%m/%d/%Y')

        action_text = li.xpath('string(span[2])').strip()
        if not action_text.strip():
            continue

        kwargs = dict(date=action_date, actor=action_chamber,
                      action=action_text)
        kwargs.update(**self.categorizer.categorize(action_text))
        bill.add_action(**kwargs)

    # Documents (including votes)
    for doc_type, doc_meta in BillDocuments(self, doc):
        if doc_type == 'version':
            bill.add_version(doc_meta.title or doc_meta.text,
                             url=doc_meta.url,
                             mimetype='application/pdf')
        elif doc_type == 'document':
            bill.add_document(doc_meta.title or doc_meta.text,
                              url=doc_meta.url,
                              mimetype='application/pdf')
        elif doc_type == 'rollcall':
            self.add_rollcall(chamber, bill, doc_meta)

    self.save_bill(bill)
def scrape_assem_bills(self, chamber, insert, session, year):
    doc_type = {1: 'bill', 3: 'resolution', 5: 'concurrent resolution',
                6: 'joint resolution', 9: 'petition'}
    for docnum, bill_type in doc_type.iteritems():
        parentpage_url = 'http://www.leg.state.nv.us/Session/%s/Reports/HistListBills.cfm?DoctypeID=%s' % (insert, docnum)
        links = self.scrape_links(parentpage_url)
        count = 0
        for link in links:
            count = count + 1
            page_path = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (insert, link)

            page = self.get(page_path).text
            page = page.replace(u"\xa0", " ")
            root = lxml.html.fromstring(page)
            root.make_links_absolute("http://www.leg.state.nv.us/")

            bill_id = root.xpath('string(/html/body/div[@id="content"]/table[1]/tr[1]/td[1]/font)')
            title = self.get_node(
                root,
                '//div[@id="content"]/table/tr[preceding-sibling::tr/td/'
                'b[contains(text(), "By:")]]/td/em/text()')

            bill = Bill(session, chamber, bill_id, title, type=bill_type)

            bill['subjects'] = list(set(self.subject_mapping[bill_id]))

            billtext = root.xpath("//b[text()='Bill Text']")[0].getparent().getnext()
            text_urls = billtext.xpath("./a")
            for text_url in text_urls:
                version_name = text_url.text.strip()
                version_url = text_url.attrib['href']
                bill.add_version(version_name, version_url,
                                 mimetype='application/pdf')

            primary, secondary = self.scrape_sponsors(page)

            for leg in primary:
                bill.add_sponsor('primary', leg)
            for leg in secondary:
                bill.add_sponsor('cosponsor', leg)

            minutes_count = 2
            for mr in root.xpath('//table[4]/tr/td[3]/a'):
                minutes = mr.xpath("string(@href)")
                minutes_url = "http://www.leg.state.nv.us" + minutes
                minutes_date_path = "string(//table[4]/tr[%s]/td[2])" % minutes_count
                minutes_date = mr.xpath(minutes_date_path).split()
                minutes_date = minutes_date[0] + minutes_date[1] + minutes_date[2] + " Minutes"
                bill.add_document(minutes_date, minutes_url)
                minutes_count = minutes_count + 1

            self.scrape_actions(root, bill, "lower")
            self.scrape_votes(page, page_path, bill, insert, year)
            bill.add_source(page_path)
            self.save_bill(bill)
def scrape_bill(self, chamber, session, bill_id, url):
    try:
        page = self.urlopen(url)
    except scrapelib.HTTPError:
        self.warning("couldn't open %s, skipping bill" % url)
        return
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)

    # strip non-breaking spaces out of the header before splitting it
    header = page.xpath('//h3/br')[0].tail.replace(u'\xa0', ' ')
    title, primary_sponsor = header.split(' -- ')

    if bill_id.startswith('H.B.') or bill_id.startswith('S.B.'):
        bill_type = ['bill']
    elif bill_id.startswith('H.R.') or bill_id.startswith('S.R.'):
        bill_type = ['resolution']
    elif bill_id.startswith('H.C.R.') or bill_id.startswith('S.C.R.'):
        bill_type = ['concurrent resolution']
    elif bill_id.startswith('H.J.R.') or bill_id.startswith('S.J.R.'):
        bill_type = ['joint resolution']

    for flag in SUB_BLACKLIST:
        if flag in bill_id:
            bill_id = bill_id.replace(flag, " ")
    bill_id = re.sub(r"\s+", " ", bill_id).strip()

    bill = Bill(session, chamber, bill_id, title, type=bill_type)
    bill.add_sponsor('primary', primary_sponsor)
    bill.add_source(url)

    for link in page.xpath(
            '//a[contains(@href, "bills/") and text() = "HTML"]'):
        name = link.getprevious().tail.strip()
        bill.add_version(name, link.attrib['href'], mimetype="text/html")
        next = link.getnext()
        if next.text == "PDF":
            bill.add_version(name, next.attrib['href'],
                             mimetype="application/pdf")

    for link in page.xpath(
            "//a[contains(@href, 'fnotes') and text() = 'HTML']"):
        bill.add_document("Fiscal Note", link.attrib['href'])

    subjects = []
    for link in page.xpath("//a[contains(@href, 'RelatedBill')]"):
        subjects.append(link.text.strip())
    bill['subjects'] = subjects

    status_link = page.xpath('//a[contains(@href, "billsta")]')[0]
    self.parse_status(bill, status_link.attrib['href'])

    self.save_bill(bill)
def get_bill_info(self, chamber, session, bill_detail_url, version_list_url):
    """Extracts all the requested info for a given bill.

    Calls the parent's methods to enter the results into JSON files.
    """
    if chamber == "House":
        chamber = 'lower'
    else:
        chamber = 'upper'

    with self.urlopen(bill_detail_url) as bill_html:
        doc = lxml.html.fromstring(bill_html)

        bill_id = doc.xpath('//title/text()')[0].split()[0]
        bill_title = doc.xpath('//font[@size=-1]/text()')[0]
        bill_type = {
            'F': 'bill',
            'R': 'resolution',
            'C': 'concurrent resolution',
        }[bill_id[1]]
        bill = Bill(session, chamber, bill_id, bill_title, type=bill_type)
        bill['subjects'] = self._subject_mapping[bill_id]
        bill.add_source(bill_detail_url)

        # grab sponsors
        sponsors = doc.xpath(
            '//table[@summary="Show Authors"]/descendant::a/text()')
        if sponsors:
            primary_sponsor = sponsors[0].strip()
            bill.add_sponsor('primary', primary_sponsor)
            cosponsors = sponsors[1:]
            for leg in cosponsors:
                bill.add_sponsor('cosponsor', leg.strip())

        # Add Actions performed on the bill.
        bill_actions = self.extract_bill_actions(doc, chamber)
        for action in bill_actions:
            bill.add_action(action['action_chamber'],
                            action['action_text'],
                            action['action_date'],
                            type=action['action_type'])

    # Get all versions of the bill.
    # Versions of a bill are on a separate page, linked to from the column
    # labeled, "Bill Text", on the search results page.
    with self.urlopen(version_list_url) as version_html:
        version_doc = lxml.html.fromstring(version_html)
        for v in version_doc.xpath(
                '//a[starts-with(@href, "/bin/getbill.php")]'):
            version_url = urlparse.urljoin(VERSION_URL_BASE, v.get('href'))
            bill.add_version(v.text.strip(), version_url)

    self.save_bill(bill)
def scrape_bill(self, chamber, session, billid, histurl, year):
    if year[0] == 'R':
        # regular session: use the year string as-is
        session = year
    else:
        # numbered sub-session: look it up by its leading digit
        session = self.metadata['session_details'][year]['sub_sessions'][
            int(year[0]) - 1]

    with self.urlopen(histurl) as data:
        soup = BeautifulSoup(cleansource(data))
        basicinfo = soup.findAll('div', id='bhistleft')[0]
        hist = basicinfo.table

        sponsor = None
        title = None
        for b in basicinfo.findAll('b'):
            if b.next.startswith('SUMMARY'):
                title = b.findNextSiblings(text=True)[0].strip()
            elif b.next.startswith('SPONSOR'):
                for a in b.findNextSiblings('a'):
                    if not issponsorlink(a):
                        break
                    sponsor = cleansponsor(a.contents[0])

        bill = Bill(session, chamber, billid, title)

        if sponsor:
            bill.add_sponsor('primary', sponsor)

        for row in hist.findAll('tr'):
            link = row.td.a
            vlink = urlbase % link['href']
            vname = link.contents[0].strip()
            bill.add_version(vname, vlink)

        history = soup.findAll('div', id='bhisttab')[0].table
        rows = history.findAll('tr')[1:]
        for row in rows:
            tds = row.findAll('td')
            if len(tds) < 2:
                # This is not actually an action
                continue
            date, action = row.findAll('td')[:2]
            date = dt.datetime.strptime(date.contents[0], '%m/%d/%y')
            action = action.contents[0].strip()
            if 'House' in action:
                actor = 'lower'
            elif 'Senate' in action:
                actor = 'upper'
            else:
                # for lack of a better guess
                actor = chamber

            bill.add_action(actor, action, date)

        self.save_bill(bill)
def scrape_senate_bills(self, chamber, insert, session, year):
    doc_type = {2: 'bill', 4: 'resolution', 7: 'concurrent resolution',
                8: 'joint resolution'}
    for docnum, bill_type in doc_type.iteritems():
        parentpage_url = 'http://www.leg.state.nv.us/Session/%s/Reports/HistListBills.cfm?DoctypeID=%s' % (insert, docnum)
        links = self.scrape_links(parentpage_url)
        count = 0
        for link in links:
            count = count + 1
            page_path = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (insert, link)

            page = self.get(page_path).text
            page = page.replace(u"\xa0", " ")
            root = lxml.html.fromstring(page)

            bill_id = root.xpath('string(/html/body/div[@id="content"]/table[1]/tr[1]/td[1]/font)')
            title = self.get_node(
                root,
                '//div[@id="content"]/table/tr[preceding-sibling::tr/td/'
                'b[contains(text(), "By:")]]/td/em/text()')

            bill = Bill(session, chamber, bill_id, title, type=bill_type)
            bill['subjects'] = list(set(self.subject_mapping[bill_id]))

            for table in root.xpath('//div[@id="content"]/table'):
                if 'Bill Text' in table.text_content():
                    bill_text = table.xpath("string(tr/td[2]/a/@href)")
                    text_url = "http://www.leg.state.nv.us" + bill_text
                    bill.add_version("Bill Text", text_url,
                                     mimetype='application/pdf')

            primary, secondary = self.scrape_sponsors(page)

            for leg in primary:
                bill.add_sponsor('primary', leg)
            for leg in secondary:
                bill.add_sponsor('cosponsor', leg)

            minutes_count = 2
            for mr in root.xpath('//table[4]/tr/td[3]/a'):
                minutes = mr.xpath("string(@href)")
                minutes_url = "http://www.leg.state.nv.us" + minutes
                minutes_date_path = "string(//table[4]/tr[%s]/td[2])" % minutes_count
                minutes_date = mr.xpath(minutes_date_path).split()
                minutes_date = minutes_date[0] + minutes_date[1] + minutes_date[2] + " Agenda"
                bill.add_document(minutes_date, minutes_url)
                minutes_count = minutes_count + 1

            self.scrape_actions(root, bill, "upper")
            self.scrape_votes(page, page_path, bill, insert, year)
            bill.add_source(page_path)
            self.save_bill(bill)
def scrape_bill(self, session, chamber, bill_type, bill_url):
    with self.urlopen(bill_url) as page:
        page = lxml.html.fromstring(page)
        page.make_links_absolute(bill_url)

        # split "SB1 SD2 HD2" to get SB1
        bill_id = page.xpath('//a[@id="LinkButtonMeasure"]')[0].text_content().split()[0]

        title = page.xpath('//span[@id="ListView1_ctrl0_measure_titleLabel"]')[0].text
        subjects = page.xpath('//span[@id="ListView1_ctrl0_report_titleLabel"]')[0].text.split('; ')
        subjects = [s.strip() for s in subjects if s.strip()]
        description = page.xpath('//span[@id="ListView1_ctrl0_descriptionLabel"]')[0].text
        sponsors = page.xpath('//span[@id="ListView1_ctrl0_introducerLabel"]')[0].text
        referral = page.xpath('//span[contains(@id, "referral")]/text()')[0]

        bill = Bill(session, chamber, bill_id, title, subjects=subjects,
                    type=bill_type, description=description,
                    referral=referral)
        for sponsor in sponsors.split(', '):
            if sponsor.endswith(' (BR)'):
                sponsor = sponsor[:-5]
            bill.add_sponsor('primary', sponsor)

        # actions
        actions = []
        table = page.xpath('//table[@id="GridViewStatus"]')[0]
        for row in table.xpath('tr'):
            action_params = {}
            cells = row.xpath('td')
            if len(cells) == 3:
                ch = cells[1].xpath('font')[0].text
                action_params['actor'] = house[ch]
                action_params['action'] = cells[2].xpath('font')[0].text
                action_date = cells[0].xpath('font')[0].text
                action_params['date'] = datetime.strptime(action_date, "%m/%d/%Y")
                action_params['type'] = categorize_action(action_params['action'])
                actions.append(action_params)

        for action_params in actions:
            bill.add_action(**action_params)
            self.parse_vote(bill, action_params['action'],
                            action_params['actor'], action_params['date'])

        # add versions
        try:
            for version in page.xpath('//a[contains(@id, "StatusLink")]'):
                bill.add_version(version.text.replace('_', ' '),
                                 version.get('href'))
        except IndexError:
            # href not found.
            pass

        bill.add_source(bill_url)
        self.save_bill(bill)
def scrape_bill(self, session, chamber, bill_id, short_title, url):
    if bill_id in ['SCR 0003', 'SB 0251', 'SB 0292']:
        return

    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        title = page.xpath("//br")[8].tail
        if not title:
            title = short_title
        title = title.strip()

        abbrev = bill_id.split()[0]
        if abbrev.endswith('B'):
            bill_type = ['bill']
        elif abbrev.endswith('JR'):
            bill_type = ['joint resolution']
        elif abbrev.endswith('CR'):
            bill_type = ['concurrent resolution']
        elif abbrev.endswith('R'):
            bill_type = ['resolution']

        bill = Bill(session, chamber, bill_id, title, type=bill_type)
        bill.add_source(url)

        action_link = page.xpath("//a[contains(@href, 'getActions')]")[0]
        self.scrape_actions(bill, action_link.attrib['href'])

        version_path = "//a[contains(., '%s')]"
        for version_type in ('Introduced Bill', 'House Bill', 'Senate Bill',
                             'Engrossed Bill', 'Enrolled Act'):
            path = version_path % version_type
            links = page.xpath(path)
            if links:
                bill.add_version(version_type, links[0].attrib['href'])

        for vote_link in page.xpath("//a[contains(@href, 'Srollcal')]"):
            self.scrape_senate_vote(bill, vote_link.attrib['href'])

        for vote_link in page.xpath("//a[contains(@href, 'Hrollcal')]"):
            self.scrape_house_vote(bill, vote_link.attrib['href'])

        for doc_link in page.xpath("//a[contains(@href, 'FISCAL')]"):
            num = doc_link.text.strip().split("(")[0]
            bill.add_document("Fiscal Impact Statement #%s" % num,
                              doc_link.attrib['href'])

        bill['subjects'] = self.subjects[bill_id]
        self.save_bill(bill)
def scrape_bill(self, chamber, session, bill_id):
    biennium = "%s-%s" % (session[0:4], session[7:9])
    bill_num = bill_id.split()[1]

    url = ("%s/GetLegislation?biennium=%s&billNumber"
           "=%s" % (self._base_url, biennium, bill_num))

    page = self.urlopen(url)
    page = lxml.etree.fromstring(page.bytes)
    page = xpath(page, "//wa:Legislation")[0]

    title = xpath(page, "string(wa:LongDescription)")

    bill_type = xpath(
        page,
        "string(wa:ShortLegislationType/wa:LongLegislationType)")
    bill_type = bill_type.lower()

    if bill_type == 'gubernatorial appointment':
        return

    bill = Bill(session, chamber, bill_id, title, type=[bill_type])

    fake_source = ("http://apps.leg.wa.gov/billinfo/"
                   "summary.aspx?bill=%s&year=%s" % (bill_num, session[0:4]))
    bill.add_source(fake_source)

    chamber_name = {'lower': 'House', 'upper': 'Senate'}[chamber]
    mimetype = 'text/html'
    version_url = ("http://www.leg.wa.gov/pub/billinfo/%s/"
                   "Htm/Bills/%s %ss/%s.htm" % (biennium, chamber_name,
                                                bill_type.title(), bill_num))

    # Sometimes the measure's version_url isn't guessable. When that happens
    # have to get the url from the source page.
    version_resp = self.get(version_url)
    if version_resp.status_code != 200:
        webpage = self.get(fake_source).text
        webdoc = lxml.html.fromstring(webpage)
        version_url = webdoc.xpath(
            '//a[contains(@href, "billdocs")]/@href')[-1]
        if version_url.lower().endswith('.pdf'):
            mimetype = 'application/pdf'

    bill.add_version(bill_id, version_url, mimetype=mimetype)

    self.scrape_sponsors(bill)
    self.scrape_actions(bill, bill_num)
    self.scrape_votes(bill)
    self.fix_prefiled_action_dates(bill)

    return bill
def process_bill(self, data):
    chamber = parse_psuedo_id(data['from_organization'])['classification']

    bill = Bill(data['legislative_session'], chamber, data['identifier'],
                data['title'], subjects=data['subject'],
                type=data['classification'])
    if data['abstracts']:
        bill['summary'] = data['abstracts'][0]['abstract']
    bill.update(**data['extras'])

    for action in data['actions']:
        actor = parse_psuedo_id(action['organization_id'])['classification']
        bill.add_action(actor,
                        action['description'],
                        parse_date(action['date']),
                        type=_action_categories(action['classification']))
        # TODO: related entities

    for source in data['sources']:
        bill.add_source(source['url'])

    for sponsor in data['sponsorships']:
        bill.add_sponsor(sponsor['classification'], sponsor['name'])

    for version in data['versions']:
        for link in version['links']:
            bill.add_version(version['note'], link['url'],
                             mimetype=link['media_type'],
                             date=parse_date(version['date']))

    for doc in data['documents']:
        for link in doc['links']:
            bill.add_document(doc['note'], link['url'],
                              mimetype=link['media_type'],
                              date=parse_date(doc['date']))

    # other_titles entries are dicts, so pull out the title string
    for title in data['other_titles']:
        bill.add_title(title['title'])

    # TODO: related bills
    # for related in data['related_bills']:

    self.save_bill(bill)
def scrape1995(self, url, year, chamberName, session, number):
    "e.g. http://www.legis.ga.gov/legis/1995_96/leg/sum/sb1.htm"
    with self.lxml_context(url) as page:
        # Bill
        name = page.cssselect('h3 br')[0].tail.split('-', 1)[1].strip()
        bill = Bill(session, chamberName, number, name)

        # Versions
        bill.add_version('Current', url.replace('/sum/', '/fulltext/'),
                         mimetype='text/html')

        # Sponsorships
        rows = page.cssselect('center table tr')
        for row in rows:
            if row.text_content().strip() == 'Sponsor and CoSponsors':
                continue
            if row.text_content().strip() == 'Links / Committees / Status':
                break
            for a in row.cssselect('a'):
                bill.add_sponsor('', a.text_content().strip())

        # Actions
        # The actions are in a pre table that looks like:
        """    SENATE                           HOUSE
               -------------------------------------
             1/13/95   Read 1st time            2/6/95
             1/31/95   Favorably Reported
             2/1/95    Read 2nd Time            2/7/95
             2/3/95    Read 3rd Time
             2/3/95    Passed/Adopted                   """
        actions = page.cssselect('pre')[0].text_content().split('\n')
        actions = actions[2:]
        for action in actions:
            senate_date = action[:22].strip()
            action_text = action[23:46].strip()
            house_date = action[46:].strip()
            if '/' not in senate_date and '/' not in house_date:
                continue
            if senate_date:
                bill.add_action('upper', action_text, senate_date)
            if house_date:
                bill.add_action('lower', action_text, house_date)

        self.save_bill(bill)
def scrape_senate_bills(self, chamber, insert, session, year):
    doc_type = {2: 'bill', 4: 'resolution', 7: 'concurrent resolution',
                8: 'joint resolution'}
    for docnum, bill_type in doc_type.iteritems():
        parentpage_url = 'http://www.leg.state.nv.us/Session/%s/Reports/HistListBills.cfm?DoctypeID=%s' % (insert, docnum)
        links = self.scrape_links(parentpage_url)
        count = 0
        for link in links:
            count = count + 1
            page_path = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (insert, link)
            with self.urlopen(page_path) as page:
                page = page.decode("utf8").replace(u"\xa0", " ")
                root = lxml.html.fromstring(page)

                bill_id = root.xpath('string(/html/body/div[@id="content"]/table[1]/tr[1]/td[1]/font)')
                title = root.xpath('string(/html/body/div[@id="content"]/table[1]/tr[5]/td)')

                bill = Bill(session, chamber, bill_id, title, type=bill_type)
                bill['subjects'] = self.subject_mapping[bill_id]

                bill_text = root.xpath("string(/html/body/div[@id='content']/table[6]/tr/td[2]/a/@href)")
                text_url = "http://www.leg.state.nv.us" + bill_text
                bill.add_version("Bill Text", text_url)

                primary, secondary = self.scrape_sponsors(page)

                for leg in primary:
                    bill.add_sponsor('primary', leg)
                for leg in secondary:
                    bill.add_sponsor('cosponsor', leg)

                minutes_count = 2
                for mr in root.xpath('//table[4]/tr/td[3]/a'):
                    minutes = mr.xpath("string(@href)")
                    minutes_url = "http://www.leg.state.nv.us" + minutes
                    minutes_date_path = "string(//table[4]/tr[%s]/td[2])" % minutes_count
                    minutes_date = mr.xpath(minutes_date_path).split()
                    minutes_date = minutes_date[0] + minutes_date[1] + minutes_date[2] + " Minutes"
                    bill.add_document(minutes_date, minutes_url)
                    minutes_count = minutes_count + 1

                self.scrape_actions(root, bill, "upper")
                self.scrape_votes(page, bill, insert, year)
                bill.add_source(page_path)
                self.save_bill(bill)
def scrape(self, chamber, session): self.validate_session(session) if chamber == 'upper': bill_no = 1 abbr = 'SB' else: bill_no = 4001 abbr = 'HB' while True: bill_page = self.scrape_bill(session, abbr, bill_no) # if we can't find a page, we must be done. This is a healthy thing. if bill_page is None: return bill_page = BeautifulSoup(bill_page) title = ''.join(self.flatten(bill_page.findAll(id='frg_billstatus_ObjectSubject')[0])) title = title.replace('\n', '').replace('\r', '') bill_id = "%s %d" % (abbr, bill_no) the_bill = Bill(session, chamber, bill_id, title) # sponsors first = 0 for name in bill_page.findAll(id='frg_billstatus_SponsorList')[0].findAll('a'): the_bill.add_sponsor(['primary', 'cosponsor'][first], name.string) first = 1 # versions for doc in bill_page.findAll(id='frg_billstatus_DocumentGridTable')[0].findAll('tr'): r = self.parse_doc(the_bill, doc) if r: the_bill.add_version(*r) # documents if 'frg_billstatus_HlaTable' in str(bill_page): for doc in bill_page.findAll(id='frg_billstatus_HlaTable')[0].findAll('tr'): r = self.parse_doc(the_bill, doc) if r: the_bill.add_document(*r) if 'frg_billstatus_SfaSection' in str(bill_page): for doc in bill_page.findAll(id='frg_billstatus_SfaSection')[0].findAll('tr'): r = self.parse_doc(the_bill, doc) if r: the_bill.add_document(*r) self.parse_actions(the_bill, bill_page.findAll(id='frg_billstatus_HistoriesGridView')[0]) self.save_bill(the_bill) bill_no = bill_no + 1
def scrape_bill(self, chamber, session): url = "ftp://www.arkleg.state.ar.us/dfadooas/LegislativeMeasures.txt" page = self.get(url).text page = unicode_csv_reader(StringIO.StringIO(page), delimiter='|') for row in page: bill_chamber = {'H': 'lower', 'S': 'upper'}[row[0]] if bill_chamber != chamber: continue bill_id = "%s%s %s" % (row[0], row[1], row[2]) type_spec = re.match(r'(H|S)([A-Z]+)\s', bill_id).group(2) bill_type = { 'B': 'bill', 'R': 'resolution', 'JR': 'joint resolution', 'CR': 'concurrent resolution', 'MR': 'memorial resolution', 'CMR': 'concurrent memorial resolution'}[type_spec] if row[-1] != self.slug: continue bill = Bill(session, chamber, bill_id, row[3], type=bill_type) bill.add_source(url) primary = row[11] if not primary: primary = row[12] if primary: bill.add_sponsor('primary', primary) # ftp://www.arkleg.state.ar.us/Bills/ # TODO: Keep an eye on this post 2017 to see if they apply R going forward. session_code = '2017R' if session == '2017' else session version_url = ("ftp://www.arkleg.state.ar.us/Bills/" "%s/Public/%s.pdf" % ( session_code, bill_id.replace(' ', ''))) bill.add_version(bill_id, version_url, mimetype='application/pdf') self.scrape_bill_page(bill) self.bills[bill_id] = bill
def scrape_bill(self, chamber, session, bill_id): biennium = "%s-%s" % (session[0:4], session[7:9]) bill_num = bill_id.split()[1] url = ("%s/GetLegislation?biennium=%s&billNumber" "=%s" % (self._base_url, biennium, bill_num)) page = self.urlopen(url) page = lxml.etree.fromstring(page.bytes) page = xpath(page, "//wa:Legislation")[0] title = xpath(page, "string(wa:LongDescription)") bill_type = xpath( page, "string(wa:ShortLegislationType/wa:LongLegislationType)") bill_type = bill_type.lower() if bill_type == 'gubernatorial appointment': return bill = Bill(session, chamber, bill_id, title, type=[bill_type]) chamber_name = {'lower': 'House', 'upper': 'Senate'}[chamber] version_url = ("http://www.leg.wa.gov/pub/billinfo/%s/" "Htm/Bills/%s %ss/%s.htm" % (biennium, chamber_name, bill_type.title(), bill_num)) bill.add_version(bill_id, version_url, mimetype='text/html') fake_source = ("http://apps.leg.wa.gov/billinfo/" "summary.aspx?bill=%s&year=%s" % ( bill_num, session[0:4])) bill.add_source(fake_source) self.scrape_sponsors(bill) self.scrape_actions(bill, bill_num) self.scrape_votes(bill) self.fix_prefiled_action_dates(bill) return bill
def scrape(self, chamber, session): chamber_abbrev = {'upper': 'SF', 'lower': 'HB'}[chamber] url = ("http://legisweb.state.wy.us/%s/billreference/" "BillReference.aspx?type=%s" % (session, chamber_abbrev)) page = self.lxmlize(url) for tr in page.xpath( "//table[contains(@id,'cphContent_gvBills')]//tr")[1:]: bill_id = tr.xpath("string(td[1])").strip() title = tr.xpath("string(td[2])").strip() if bill_id[0:2] in ['SJ', 'HJ']: bill_type = 'joint resolution' else: bill_type = 'bill' bill = Bill(session, chamber, bill_id, title, type=bill_type) self.scrape_digest(bill) # versions for a in (tr.xpath('td[8]//a') + tr.xpath('td[11]//a') + tr.xpath('td[12]//a')): # skip references to other bills if a.text.startswith('See'): continue bill.add_version(a.text, a.get('href'), mimetype='application/pdf') # documents fnote = tr.xpath('td[9]//a') if fnote: bill.add_document('Fiscal Note', fnote[0].get('href')) summary = tr.xpath('td[14]//a') if summary: bill.add_document('Summary', summary[0].get('href')) bill.add_source(url) self.save_bill(bill)
def scrape_bill(self, chamber, session, bill_id, url): with self.urlopen(url) as page: page = lxml.html.fromstring(page) page.make_links_absolute(url) header = page.xpath('//h3/br')[0].tail.replace(u'\xa0', ' ') title, primary_sponsor = header.split(' -- ') if bill_id.startswith('H.B.') or bill_id.startswith('S.B.'): bill_type = ['bill'] elif bill_id.startswith('H.R.') or bill_id.startswith('S.R.'): bill_type = ['resolution'] elif bill_id.startswith('H.C.R.') or bill_id.startswith('S.C.R.'): bill_type = ['concurrent resolution'] elif bill_id.startswith('H.J.R.') or bill_id.startswith('S.J.R.'): bill_type = ['joint resolution'] bill = Bill(session, chamber, bill_id, title, type=bill_type) bill.add_sponsor('primary', primary_sponsor) bill.add_source(url) for link in page.xpath( '//a[contains(@href, "bills/") and text() = "HTML"]'): name = link.getprevious().tail.strip() bill.add_version(name, link.attrib['href']) for link in page.xpath( "//a[contains(@href, 'fnotes') and text() = 'HTML']"): bill.add_document("Fiscal Note", link.attrib['href']) subjects = [] for link in page.xpath("//a[contains(@href, 'RelatedBill')]"): subjects.append(link.text.strip()) bill['subjects'] = subjects status_link = page.xpath('//a[contains(@href, "billsta")]')[0] self.parse_status(bill, status_link.attrib['href']) self.save_bill(bill)
def scrape(self, chamber, session): chamber_abbrev = {'upper': 'SF', 'lower': 'HB'}[chamber] url = ("http://legisweb.state.wy.us/%s/billindex/" "BillCrossRef.aspx?type=%s" % (session, chamber_abbrev)) page = lxml.html.fromstring(self.urlopen(url)) for tr in page.xpath("//tr[@valign='middle']")[1:]: bill_id = tr.xpath("string(td[1])").strip() title = tr.xpath("string(td[2])").strip() if bill_id[0:2] in ['SJ', 'HJ']: bill_type = 'joint resolution' else: bill_type = 'bill' bill = Bill(session, chamber, bill_id, title, type=bill_type) self.scrape_digest(bill) # versions for a in (tr.xpath('td[6]//a') + tr.xpath('td[9]//a') + tr.xpath('td[10]//a')): # skip references to other bills if a.text.startswith('See'): continue bill.add_version(a.text, a.get('href'), mimetype='application/pdf') # documents fnote = tr.xpath('td[7]//a') if fnote: bill.add_document('Fiscal Note', fnote[0].get('href')) summary = tr.xpath('td[12]//a') if summary: bill.add_document('Summary', summary[0].get('href')) bill.add_source(url) self.save_bill(bill)
def parse_bill(scraper, url): """Given a bill status URL, return a fully loaded Bill object, except for votes, which are expected to be handled externally. """ session = extract_session(url) chamber = chamber_for_doctype(extract_doctype(url)) s = get_soup(scraper, url) bill_id = extract_bill_id(s) landmark = s(text=re.compile(".*Short Description.*")) name_span = landmark[0].findParent().findNextSibling() bill_name = get_text(name_span) bill = Bill(session, chamber, bill_id, bill_name.strip(), status_url=url) actions = extract_actions(s) for chamber, action, date in actions: bill.add_action(chamber, action, date) # kwargs are permitted if we have 'em. sponsor_dict = extract_sponsors_from_actions([action[1] for action in actions]) for type, namelist in sponsor_dict.iteritems(): for name in namelist: bill.add_sponsor(type, name) for name, link in extract_versions(scraper, s): bill.add_version(name, link) return bill
def scrape1997(self, url, year, chamberName, session, number): "e.g. http://www.legis.ga.gov/legis/1997_98/leg/sum/sb1.htm" with self.lxml_context(url) as page: # Grab the interesting tables on the page. tables = [] for table in page.cssselect('center table'): if table.get('border') == '5': tables.append(table) # Bill name = page.cssselect( 'tr > td > font > b')[0].text_content().split('-', 1)[1] bill = Bill(session, chamberName, number, name) # Versions bill.add_version('Current', url.replace('/sum/', '/fulltext/'), mimetype='text/html') # Sponsorships for a in tables[0].cssselect('a'): if a.text_content().strip() == 'Current': break bill.add_sponsor('', a.text_content().strip()) # Actions for row in tables[1].cssselect('tr'): senate_date = row[0].text_content().strip() action_text = row[1].text_content().strip() house_date = row[2].text_content().strip() if '/' not in senate_date and '/' not in house_date: continue if senate_date: bill.add_action('upper', action_text, senate_date) if house_date: bill.add_action('lower', action_text, house_date) self.save_bill(bill)
def scrape_bill(self, chamber, session): url = "ftp://www.arkleg.state.ar.us/dfadooas/LegislativeMeasures.txt" page = self.urlopen(url) page = unicode_csv_reader(StringIO.StringIO(page), delimiter='|') for row in page: bill_chamber = {'H': 'lower', 'S': 'upper'}[row[0]] if bill_chamber != chamber: continue bill_id = "%s%s %s" % (row[0], row[1], row[2]) type_spec = re.match(r'(H|S)([A-Z]+)\s', bill_id).group(2) bill_type = { 'B': 'bill', 'R': 'resolution', 'JR': 'joint resolution', 'CR': 'concurrent resolution', 'MR': 'memorial resolution', 'CMR': 'concurrent memorial resolution' }[type_spec] if row[-1] != self.slug: continue bill = Bill(session, chamber, bill_id, row[3], type=bill_type) bill.add_source(url) bill.add_sponsor('primary', row[11]) version_url = ("ftp://www.arkleg.state.ar.us/Bills/" "%s/Public/%s.pdf" % (session, bill_id.replace(' ', ''))) bill.add_version(bill_id, version_url) self.scrape_bill_page(bill) self.bills[bill_id] = bill
def scrape2001(self, url, year, chamberName, session, number): "e.g. http://www.legis.ga.gov/legis/2001_02/sum/sb1.htm" with self.lxml_context(url) as page: # Grab the interesting tables on the page. tables = page.cssselect('table center table') # Bill name = tables[0].text_content().split('-', 1)[1] bill = Bill(session, chamberName, number, name) # Sponsorships for a in tables[1].cssselect('a'): bill.add_sponsor('', a.text_content().strip()) # Actions center = page.cssselect('table center')[-1] for row in center.cssselect('table table')[0].cssselect('tr')[2:]: date = row[0].text_content().strip() action_text = row[1].text_content().strip() if '/' not in date: continue if action_text.startswith('Senate'): action_text = action_text.split(' ', 1)[1].strip() bill.add_action('upper', action_text, date) elif action_text.startswith('House'): action_text = action_text.split(' ', 1)[1].strip() bill.add_action('lower', action_text, date) # Versions for a in center.cssselect('table table')[1].cssselect('a'): bill.add_version(a.text_content(), urlparse.urljoin(url, a.get('href')), mimetype='text/html') self.save_bill(bill)
def scrape_bill(self, url, kw, re_amendment=re.compile(r'(^[A-Z]A \d{1,3}) to'), re_substitution=re.compile(r'(^[A-Z]S \d{1,2}) for'), re_digits=re.compile(r'\d{,5}'), actions_categorize=actions.categorize, actions_get_actor=actions.get_actor): bill = Bill(**kw) bill.add_source(url) #--------------------------------------------------------------------- # A few helpers. _url_2_lxml = self._url_2_lxml _cleanup_sponsors = self._cleanup_sponsors # Shortcut function partial to get text at a particular xpath: doc = _url_2_lxml(url) _get_text = partial(get_text, doc, 0) # Get session number--needed for fetching related documents (see below). xpath = '//font[contains(., "General Assembly") and @face="Arial"]' session_num = doc.xpath(xpath)[0].text_content() session_num = re_digits.match(session_num).group() #--------------------------------------------------------------------- # Sponsors chamber = bill['chamber'] sponsor_types = { 'Additional Sponsor(s):': 'cosponsor', 'CoSponsors:': 'cosponsor', 'Primary Sponsor:': 'primary' } xpath = '//font[contains(., "Sponsor") and @color="#008080"]' headings = doc.xpath(xpath + '/text()') sponsors = doc.xpath(xpath + '/../../following-sibling::td/font/text()') for h, s in zip(headings, sponsors): names = _cleanup_sponsors(s, chamber) type_ = sponsor_types[h.strip()] if names: for name, _chamber in names: bill.add_sponsor(type_, name, chamber=_chamber) #--------------------------------------------------------------------- # Versions tmp = '/'.join([ 'http://www.legis.delaware.gov', 'LIS/lis{session_num}.nsf/vwLegislation', '{moniker}/$file/{filename}{format_}?open' ]) documents = self.scrape_documents(source=url, docname="introduced", filename="Legis", tmp=tmp, session_num=session_num) for d in documents: bill.add_version(**d) # If bill is a substitution, add the original as a version. names = doc.xpath('//*[contains(text(), "Substituted ' 'Legislation for Bill:")]/text()') urls = doc.xpath('//*[contains(text(), "Substituted ' 'Legislation for Bill:")]' '/following-sibling::a/@href') for name, url in zip(names, urls): name = re_substitution.match(name).group(1) bill.add_version(name, url, description='original bill') #--------------------------------------------------------------------- # Actions actions = doc.xpath('//font[contains(., "Actions History")]' '/../following-sibling::table/descendant::td[2]') actions = actions[0].text_content() actions = filter(None, actions.splitlines()) for a in reversed(actions): date, action = a.split(' - ', 1) try: date = datetime.strptime(date, '%b %d, %Y') except ValueError: date = datetime.strptime(date, '%B %d, %Y') # XXX: ugh. actor = actions_get_actor(action, bill['chamber']) type_ = actions_categorize(action) bill.add_action(actor, action, date, type_) #--------------------------------------------------------------------- # Votes vote_strings = doc.xpath('//*[contains(text(), "vote:")]/text()') # Sometimes vote strings are contained in weird, separate elements. Probably # hand edited. if not all(re.search('\d', string) for string in vote_strings): # Use the parent's text_content instead. 
vote_strings = [] for el in doc.xpath('//*[contains(text(), "vote:")]/..'): vote_strings.append(el.text_content()) vote_urls = doc.xpath('//*[contains(text(), "vote:")]' '/following-sibling::a/@href') for string, url in zip(vote_strings, vote_urls): vote_data = parse_votestring(string) vote = self.scrape_vote(url, **vote_data) if vote: bill.add_vote(vote) #--------------------------------------------------------------------- # Amendments xpath = ("//font[contains(., 'Amendments')]/" "../../../td[2]/font/a") tmp = ('http://www.legis.delaware.gov/LIS/lis{session_num}.nsf/' 'vwLegislation/{id_}/$file/{filename}{format_}?open') for source, id_ in zip(doc.xpath(xpath + '/@href'), doc.xpath(xpath + '/text()')): short_id = re_amendment.match(id_).group(1) documents = self.scrape_documents(source=source, docname='amendment (%s)' % short_id, filename='Legis', tmp=tmp, session_num=session_num, id_=id_) for d in documents: bill.add_document(**d) #--------------------------------------------------------------------- # Add any related "Engrossments". # See www.ncsl.org/documents/legismgt/ILP/98Tab3Pt4.pdf for # an explanation of the engrossment process in DE. source = doc.xpath('//img[@alt="Engrossment"]/../@href') if source: tmp = '/'.join([ 'http://www.legis.delaware.gov', 'LIS/lis{session_num}.nsf/EngrossmentsforLookup', '{moniker}/$file/{filename}{format_}?open' ]) documents = self.scrape_documents(source=source[0], docname="Engrossment", filename="Engross", tmp=tmp, session_num=session_num, id_=bill['bill_id']) for d in documents: bill.add_version(**d) # -------------------------------------------------------------------- # Add any fiscal notes. source = doc.xpath("//img[@alt='Fiscal Note']/../@href") if source: tmp = '/'.join([ 'http://www.legis.delaware.gov', 'LIS/lis{session_num}.nsf/FiscalforLookup', '{docnum}/$file/{filename}{format_}?open' ]) documents = self.scrape_documents(source=source[0], docname="Fiscal Note", filename="Fiscal", tmp=tmp, session_num=session_num) for d in documents: bill.add_document(**d) #--------------------------------------------------------------------- # Extra fields # Helper to get the first td sibling of certain nodes. tmp = '//font[contains(., "%s")]/../../../td[2]' first_sibling_text = lambda heading: _get_text(tmp % heading) extra_fields = { # A long description of the legislation. "summary": "Synopsis", # Codification details for enacted legislation. "volume_chapter": "Volume Chapter", # Presumably the date of approval/veto. "date_governor_acted": "Date Governor Acted", "fiscal_notes": "Fiscal Notes", } for key, name in extra_fields.iteritems(): try: bill[key] = first_sibling_text(name) except IndexError: # xpath lookup failed. pass self.save_bill(bill)
def scrape_bill(self, chamber, session, bill_id, bill_type, url): doc = lxml.html.fromstring(self.get(url).text) doc.make_links_absolute(url) title = doc.xpath('//b[text()="TITLE:"]') if title: title = title[0].tail.strip().strip('"') else: self.warning("skipping bill %s, no information" % url) return bill = Bill(session, chamber, bill_id, title, type=bill_type) bill.add_source(url) # Get sponsors spons_str = doc.xpath( '//b[contains(text(), "SPONSOR")]')[0].tail.strip() sponsors_match = re.match( '(SENATOR|REPRESENTATIVE)\([Ss]\) ([^,]+(,[^,]+){0,})', spons_str) if sponsors_match: sponsors = sponsors_match.group(2).split(',') sponsor = sponsors[0].strip() if sponsor: bill.add_sponsor('primary', sponsors[0]) for sponsor in sponsors[1:]: sponsor = sponsor.strip() if sponsor: bill.add_sponsor('cosponsor', sponsor) else: # Committee sponsorship spons_str = spons_str.strip() if re.match(r' BY REQUEST OF THE GOVERNOR$', spons_str): spons_str = re.sub(r' BY REQUEST OF THE GOVERNOR$', '', spons_str).title() spons_str = (spons_str + " Committee (by request of the governor)") if spons_str: bill.add_sponsor('primary', spons_str) # Get actions from second myth table self._current_comm = None act_rows = doc.xpath('(//table[@class="myth"])[2]//tr')[1:] for row in act_rows: date, journal, raw_chamber, action = row.xpath('td') act_date = datetime.datetime.strptime(date.text_content().strip(), '%m/%d/%y') raw_chamber = raw_chamber.text_content().strip() action = action.text_content().strip() if raw_chamber == "(H)": act_chamber = "lower" elif raw_chamber == "(S)": act_chamber = "upper" if re.match("\w+ Y(\d+)", action): vote_href = journal.xpath('.//a/@href') if vote_href: self.parse_vote(bill, action, act_chamber, act_date, vote_href[0]) action, atype = self.clean_action(action) match = re.match('^Prefile released (\d+/\d+/\d+)$', action) if match: action = 'Prefile released' act_date = datetime.datetime.strptime(match.group(1), '%m/%d/%y') bill.add_action(act_chamber, action, act_date, type=atype) # Get subjects bill['subjects'] = [] for subj in doc.xpath('//a[contains(@href, "subject")]/text()'): bill['subjects'].append(subj.strip()) # Get versions text_list_url = "http://www.legis.state.ak.us/"\ "basis/get_fulltext.asp?session=%s&bill=%s" % ( session, bill_id) bill.add_source(text_list_url) text_doc = lxml.html.fromstring(self.get(text_list_url).text) text_doc.make_links_absolute(text_list_url) for link in text_doc.xpath('//a[contains(@href, "get_bill_text")]'): name = link.xpath('../preceding-sibling::td/text()')[0].strip() text_url = link.get('href') bill.add_version(name, text_url, mimetype="text/html") # Get documents doc_list_url = "http://www.legis.state.ak.us/"\ "basis/get_documents.asp?session=%s&bill=%s" % ( session, bill_id ) doc_list = lxml.html.fromstring(self.get(doc_list_url).text) doc_list.make_links_absolute(doc_list_url) bill.add_source(doc_list_url) for href in doc_list.xpath( '//a[contains(@href, "get_documents")][@onclick]'): h_name = href.text_content() h_href = href.attrib['href'] if h_name.strip(): bill.add_document(h_name, h_href) self.save_bill(bill)
def scrape_bill_pages(self, session, year_abr): """ assemble information on a bill from a number of DBF files """ #Main Bill information main_bill_url, main_bill_db = self.get_dbf(year_abr, 'MAINBILL') # keep a dictionary of bills (mapping bill_id to Bill obj) bill_dict = {} for rec in main_bill_db: bill_type = rec["billtype"] bill_number = int(rec["billnumber"]) bill_id = bill_type + str(bill_number) title = rec["synopsis"] if bill_type[0] == 'A': chamber = "lower" else: chamber = "upper" # some bills have a blank title.. just skip it if not title: continue bill = Bill(str(session), chamber, bill_id, title, type=self._bill_types[bill_type[1:]]) bill.add_source(main_bill_url) bill_dict[bill_id] = bill #Sponsors bill_sponsors_url, bill_sponsors_db = self.get_dbf(year_abr, 'BILLSPON') for rec in bill_sponsors_db: bill_type = rec["billtype"] bill_number = int(rec["billnumber"]) bill_id = bill_type + str(bill_number) bill = bill_dict[bill_id] name = rec["sponsor"] sponsor_type = rec["type"] if sponsor_type == 'P': sponsor_type = "Primary" else: sponsor_type = "Co-sponsor" bill.add_sponsor(sponsor_type, name) #Documents bill_document_url, bill_document_db = self.get_dbf(year_abr, 'BILLWP') #print bill_document_db[2] for rec in bill_document_db: bill_type = rec["billtype"] bill_number = int(rec["billnumber"]) bill_id = bill_type + str(bill_number) bill = bill_dict[bill_id] document = rec["document"] document = document.split('\\') document = document[-2] + "/" + document[-1] year = str(year_abr) + str((year_abr + 1)) #doc_url = "ftp://www.njleg.state.nj.us/%s/%s" % (year, document) htm_url = 'http://www.njleg.state.nj.us/%s/Bills/%s' % (year_abr, document.replace('.DOC', '.HTM')) # name document based _doctype try: doc_name = self._doctypes[rec['doctype']] except KeyError: raise Exception('unknown doctype %s on %s' % (rec['doctype'], bill_id)) if rec['comment']: doc_name += ' ' + rec['comment'] if rec['doctype'] in self._version_types: bill.add_version(doc_name, htm_url) else: bill.add_document(doc_name, htm_url) # Votes next_year = int(year_abr)+1 vote_info_list = ['A%s' % year_abr, 'A%s' % next_year, 'S%s' % year_abr, 'S%s' % next_year, 'CA%s-%s' % (year_abr, next_year), 'CS%s-%s' % (year_abr, next_year), ] for filename in vote_info_list: s_vote_url = 'ftp://www.njleg.state.nj.us/votes/%s.zip' % filename s_vote_zip, resp = self.urlretrieve(s_vote_url) zipedfile = zipfile.ZipFile(s_vote_zip) vfile = "%s.txt" % filename vote_file = zipedfile.open(vfile, 'U') vdict_file = csv.DictReader(vote_file) votes = {} if filename.startswith('A') or filename.startswith('CA'): chamber = "lower" else: chamber = "upper" if filename.startswith('C'): vote_file_type = 'committee' else: vote_file_type = 'chamber' for rec in vdict_file: if vote_file_type == 'chamber': bill_id = rec["Bill"].strip() leg = rec["Full_Name"] date = rec["Session_Date"] action = rec["Action"] leg_vote = rec["Legislator_Vote"] else: bill_id = '%s%s' % (rec['Bill_Type'], rec['Bill_Number']) leg = rec['Name'] # drop time portion date = rec['Agenda_Date'].split()[0] # make motion readable action = self._com_vote_motions[rec['BillAction']] # first char (Y/N) use [0:1] to ignore '' leg_vote = rec['LegislatorVote'][0:1] date = datetime.strptime(date, "%m/%d/%Y") vote_id = '_'.join((bill_id, chamber, action)) vote_id = vote_id.replace(" ", "_") if vote_id not in votes: votes[vote_id] = Vote(chamber, date, action, None, None, None, None, bill_id=bill_id) if vote_file_type == 'committee': votes[vote_id]['committee'] = self._committees[ 
rec['Committee_House']] if leg_vote == "Y": votes[vote_id].yes(leg) elif leg_vote == "N": votes[vote_id].no(leg) else: votes[vote_id].other(leg) # remove temp file os.remove(s_vote_zip) #Counts yes/no/other votes and saves overall vote for vote in votes.itervalues(): vote_yes_count = len(vote["yes_votes"]) vote_no_count = len(vote["no_votes"]) vote_other_count = len(vote["other_votes"]) vote["yes_count"] = vote_yes_count vote["no_count"] = vote_no_count vote["other_count"] = vote_other_count if vote_yes_count > vote_no_count: vote["passed"] = True else: vote["passed"] = False vote_bill_id = vote["bill_id"] bill = bill_dict[vote_bill_id] bill.add_vote(vote) #Actions bill_action_url, bill_action_db = self.get_dbf(year_abr, 'BILLHIST') for rec in bill_action_db: bill_type = rec["billtype"] bill_number = int(rec["billnumber"]) bill_id = bill_type + str(bill_number) bill = bill_dict[bill_id] action = rec["action"] date = rec["dateaction"] actor = rec["house"] comment = rec["comment"] action, atype = self.categorize_action(action) if comment: action += (' ' + comment) bill.add_action(actor, action, date, type=atype) # Subjects subject_url, subject_db = self.get_dbf(year_abr, 'BILLSUBJ') for rec in subject_db: bill_id = rec['billtype'] + str(int(rec['billnumber'])) bill = bill_dict.get(bill_id) if bill: bill.setdefault('subjects', []).append(rec['subjectkey']) else: self.warning('invalid bill id in BILLSUBJ.DBF: %s' % bill_id) # save all bills at the end for bill in bill_dict.itervalues(): # add sources bill.add_source(bill_sponsors_url) bill.add_source(bill_document_url) bill.add_source(bill_action_url) bill.add_source(subject_url) self.save_bill(bill)
def scrape_bill(self, chamber, session, bill_id, bill_type): url = '%s?r=%s' % (self.base_url, bill_id) with self.urlopen(url) as html: doc = lxml.html.fromstring(html) # search for Titulo, accent over i messes up lxml, so use 'tulo' title = doc.xpath( u'//td/b[contains(text(),"tulo")]/../following-sibling::td/text()' ) if not title: raise NoSuchBill() bill = Bill(session, chamber, bill_id, title[0], type=bill_type) author = doc.xpath( u'//td/b[contains(text(),"Autor")]/../text()')[0] for aname in author.split(','): bill.add_sponsor('primary', aname.strip()) co_authors = doc.xpath( u'//td/b[contains(text(),"Co-autor")]/../text()') if len(co_authors) != 0: for co_author in co_authors[1].split(','): bill.add_sponsor('cosponsor', co_author.strip()) action_table = doc.xpath('//table')[-1] for row in action_table[1:]: tds = row.xpath('td') # ignore row missing date if len(tds) != 2: continue date = datetime.datetime.strptime(tds[0].text_content(), "%m/%d/%Y") action = tds[1].text_content().strip() # parse the text to see if it's a new version or an unrelated document # if it has a '-', assume it's a vote document # get the URL of the action action_url = tds[1].xpath('a/@href') # check that it has a URL and is not just text if action_url: action_url = action_url[0] # check if it's a version of the bill or another type of document. # NOTE: not sure whether new versions of the bill are only denoted with 'Entirillado', or if that's even the correct name, but from what I can gather it looks that way. if re.match('Entirillado', action): bill.add_version(action, action_url) else: bill.add_document(action, action_url) for pattern, atype in _classifiers: if re.match(pattern, action): break else: atype = 'other' bill.add_action(chamber, action, date, type=atype) if atype == 'bill:passed' and action_url: vote_chamber = None for pattern, vote_chamber in _voteChambers: if re.match(pattern, action): break else: self.warning("couldn't find voteChamber pattern") if vote_chamber == 'lower' and len(action_url) > 0: vote = self.scrape_votes(action_url, action, date, vote_chamber) if vote[0] is not None: vote[0].add_source(action_url) bill.add_vote(vote[0]) else: self.warning('Problem reading vote: %s, %s' % (vote[1], bill_id)) bill.add_source(url) self.save_bill(bill)
def scrape_bill(self, session, chamber, bill_id, title, url, strip_sponsors=re.compile(r'\s*\(.{,50}\)\s*').sub): html = self.get(url).text page = lxml.html.fromstring(html) page.make_links_absolute(url) bill_type = self.bill_types[bill_id.split()[0][1:]] bill = Bill(session, chamber, bill_id, title, type=bill_type) bill.add_source(url) xpath = ('//strong[contains(., "SUBJECT")]/../' 'following-sibling::td/a/text()') bill['subjects'] = page.xpath(xpath) for version in self.scrape_versions(session, chamber, page, bill_id): bill.add_version(**version) # Resolution pages have different html. values = {} trs = page.xpath('//div[@id="bhistcontent"]/table/tr') for tr in trs: heading = tr.xpath('td/strong/text()') if heading: heading = heading[0] else: continue value = tr.text_content().replace(heading, '').strip() values[heading] = value # summary was always same as title #bill['summary'] = values['SUMMARY:'] # Add primary sponsor. primary = strip_sponsors('', values.get('LEAD SPONSOR:', '')) if primary: bill.add_sponsor('primary', primary) # Add cosponsors. if values.get('SPONSORS:'): sponsors = strip_sponsors('', values['SPONSORS:']) sponsors = re.split(', (?![A-Z]\.)', sponsors) for name in sponsors: name = name.strip(', \n\r') if name: # Fix name splitting bug where "Neale, D. Hall" match = re.search('(.+?), ([DM]\. Hall)', name) if match: for name in match.groups(): bill.add_sponsor('cosponsor', name) else: bill.add_sponsor('cosponsor', name) for link in page.xpath("//a[contains(@href, 'votes/house')]"): self.scrape_house_vote(bill, link.attrib['href']) for tr in reversed( page.xpath("//table[@class='tabborder']/descendant::tr")[1:]): tds = tr.xpath('td') if len(tds) < 3: continue chamber_letter = tds[0].text_content() chamber = {'S': 'upper', 'H': 'lower'}[chamber_letter] # Index of date info no longer varies on resolutions. date = tds[2].text_content().strip() date = datetime.datetime.strptime(date, "%m/%d/%y").date() action = tds[1].text_content().strip() if action.lower().startswith('passed senate'): for href in tds[1].xpath('a/@href'): self.scrape_senate_vote(bill, href, date) attrs = dict(actor=chamber, action=action, date=date) attrs.update(self.categorizer.categorize(action)) bill.add_action(**attrs) self.save_bill(bill)
def scrape_bill(self, bill_url, chamber, session): with self.urlopen(bill_url) as text: if "Specified Bill could not be found" in text: return False page = lxml.html.fromstring(text) page.make_links_absolute(bill_url) bill_id = page.xpath("string(//h2)").split()[0] summary = page.xpath( "string(//*[starts-with(text(), 'Summary: ')])") summary = summary.replace('Summary: ', '') match = re.match( r"^([^:]+): " r"((\(Constitutional [aA]mendment\) )?[^(]+)", summary) if match: subjects = [match.group(1).strip()] title = match.group(2).strip() else: raise ScrapeError("Bad title") if bill_id.startswith('SB') or bill_id.startswith('HB'): bill_type = ['bill'] elif bill_id.startswith('SR') or bill_id.startswith('HR'): bill_type = ['resolution'] elif bill_id.startswith('SCR') or bill_id.startswith('HCR'): bill_type = ['concurrent resolution'] else: raise ScrapeError("Invalid bill ID format: %s" % bill_id) if title.startswith("(Constitutional Amendment)"): bill_type.append('constitutional amendment') title = title.replace('(Constitutional Amendment) ', '') bill = Bill(session, chamber, bill_id, title, subjects=subjects, type=bill_type) bill.add_source(bill_url) history_link = page.xpath("//a[text() = 'History']")[0] history_url = history_link.attrib['href'] self.scrape_history(bill, history_url) authors_link = page.xpath("//a[text() = 'Authors']")[0] authors_url = authors_link.attrib['href'] self.scrape_authors(bill, authors_url) try: versions_link = page.xpath( "//a[text() = 'Text - All Versions']")[0] versions_url = versions_link.attrib['href'] self.scrape_versions(bill, versions_url) for doc in ["Notes", "Digest", "Amendments", "Misc"]: doc_link = page.xpath("//a[text() = '%s']" % doc)[0] doc_url = doc_link.attrib['href'] self.scrape_documents(bill, doc_url) except IndexError: # Only current version try: version_link = page.xpath( "//a[text() = 'Text - Current']")[0] version_url = version_link.attrib['href'] bill.add_version("%s Current" % bill_id, version_url, on_duplicate="use_old") except IndexError: # Some bills don't have any versions :( pass try: votes_link = page.xpath("//a[text() = 'Votes']")[0] self.scrape_votes(bill, votes_link.attrib['href']) except IndexError: # Some bills don't have any votes pass self.save_bill(bill) return True
def process_bill(self, data): chamber = parse_psuedo_id(data['from_organization'])['classification'] if chamber == 'legislature': chamber = 'upper' bill = Bill(data['legislative_session'], chamber, data['identifier'], data['title'], subjects=data['subject'], type=data['classification']) if data['abstracts']: bill['summary'] = data['abstracts'][0]['abstract'] bill.update(**data['extras']) for action in data['actions']: actor = parse_psuedo_id( action['organization_id'])['classification'] legislators = [] committees = [] for rel in action['related_entities']: if rel['entity_type'] == 'organization': committees.append(rel['name']) elif rel['entity_type'] == 'person': legislators.append(rel['name']) bill.add_action(actor, action['description'], parse_date(action['date']), type=_action_categories(action['classification']), committees=committees, legislators=legislators, **action.get('extras', {})) for source in data['sources']: bill.add_source(source['url']) for sponsor in data['sponsorships']: bill.add_sponsor( sponsor['classification'], sponsor['name'], ) for version in data['versions']: for link in version['links']: bill.add_version(version['note'], link['url'], mimetype=link['media_type'], date=parse_date(version['date']), **version.get('extras', {})) for doc in data['documents']: for link in doc['links']: bill.add_document(doc['note'], link['url'], mimetype=link['media_type'], date=parse_date(doc['date']), **doc.get('extras', {})) for title in data['other_titles']: bill.add_title(title['title']) for related in data['related_bills']: bill.add_companion(related['identifier'], related['legislative_session'], chamber) bill['alternate_bill_ids'] = [ oi['identifier'] for oi in data['other_identifiers'] ] self.save_bill(bill)
def parse_bill(self, chamber, session, bill_id, url): with self.urlopen(url) as page: page = lxml.html.fromstring(page) page.make_links_absolute(url) try: short_bill_id = re.sub(r'S([JC])R', r'S\1', bill_id) version_link = page.xpath( "//a[contains(@href, '%s/bill.doc')]" % short_bill_id)[0] except IndexError: # Bill withdrawn return pars = version_link.xpath("following-sibling::p") if len(pars) == 2: title = pars[0].xpath("string()") action_p = pars[1] else: title = pars[0].getprevious().tail action_p = pars[0] title = re.sub(ur'[\s\xa0]+', ' ', title).strip() if 'CR' in bill_id: bill_type = 'concurrent resolution' elif 'JR' in bill_id: bill_type = 'joint resolution' elif 'R' in bill_id: bill_type = 'resolution' else: bill_type = 'bill' bill = Bill(session, chamber, bill_id, title, type=bill_type) bill['subjects'] = self._subjects[bill_id] bill.add_source(url) bill.add_version("Most Recent Version", version_link.attrib['href']) for link in page.xpath("//a[contains(@href, 'legislator/')]"): bill.add_sponsor('primary', link.text.strip()) for line in action_p.xpath("string()").split("\n"): action = line.strip() if (not action or action == 'last action' or 'Prefiled' in action): continue action_date = "%s %s" % (action.split('-')[0], session[0:4]) action_date = datetime.datetime.strptime( action_date, '%b %d %Y') action = '-'.join(action.split('-')[1:]) if action.endswith('House') or action.endswith('(H)'): actor = 'lower' elif action.endswith('Senate') or action.endswith('(S)'): actor = 'upper' else: actor = chamber atype = [] if action.startswith('introduced in'): atype.append('bill:introduced') if '; to ' in action: atype.append('committee:referred') elif action.startswith('signed by Governor'): atype.append('governor:signed') elif re.match(r'^to [A-Z]', action): atype.append('committee:referred') elif action == 'adopted by voice vote': atype.append('bill:passed') if '1st reading' in action: atype.append('bill:reading:1') if '3rd reading' in action: atype.append('bill:reading:3') if '2nd reading' in action: atype.append('bill:reading:2') if 'R' in bill_id and 'adopted by voice vote' in action: atype.append('bill:passed') amendment_re = (r'floor amendments?( \([a-z\d\-]+\))*' r'( and \([a-z\d\-]+\))? filed') if re.search(amendment_re, action): atype.append('amendment:introduced') if not atype: atype = ['other'] bill.add_action(actor, action, action_date, type=atype) try: votes_link = page.xpath( "//a[contains(@href, 'vote_history.pdf')]")[0] bill.add_document("Vote History", votes_link.attrib['href']) except IndexError: # No votes pass self.save_bill(bill)
def parse_bill_xml(self, chamber, session, txt): root = lxml.etree.fromstring(txt.bytes) bill_id = ' '.join(root.attrib['bill'].split(' ')[1:]) bill_title = root.findtext("caption") if session[2] == 'R': session = session[0:2] if bill_id[1] == 'B': bill_type = ['bill'] elif bill_id[1] == 'R': bill_type = ['resolution'] elif bill_id[1:3] == 'CR': bill_type = ['concurrent resolution'] elif bill_id[1:3] == 'JR': bill_type = ['joint resolution'] else: raise ScrapeError("Invalid bill_id: %s" % bill_id) bill = Bill(session, chamber, bill_id, bill_title, type=bill_type) versions = root.xpath("//versions") for version in versions: versionz = version.xpath(".//version") for v in versionz: description = v.xpath(".//versionDescription")[0].text html_url = v.xpath(".//WebHTMLURL")[0].text bill.add_version(description, html_url, 'text/html') for action in root.findall('actions/action'): act_date = datetime.datetime.strptime(action.findtext('date'), "%m/%d/%Y").date() extra = {} extra['action_number'] = action.find('actionNumber').text comment = action.find('comment') if comment is not None and comment.text: extra['comment'] = comment.text.strip() actor = { 'H': 'lower', 'S': 'upper', 'E': 'executive' }[extra['action_number'][0]] desc = action.findtext('description').strip() if desc == 'Scheduled for public hearing on . . .': continue introduced = False if desc == 'Amended': atype = 'amendment:passed' elif desc == 'Amendment(s) offered': atype = 'amendment:introduced' elif desc == 'Amendment amended': atype = 'amendment:amended' elif desc == 'Amendment withdrawn': atype = 'amendment:withdrawn' elif desc == 'Passed' or desc == 'Adopted': atype = 'bill:passed' elif re.match(r'^Received (by|from) the', desc): if 'Secretary of the Senate' not in desc: atype = 'bill:introduced' else: atype = 'bill:filed' elif desc.startswith('Sent to the Governor'): # But what if it gets lost in the mail? 
atype = 'governor:received' elif desc.startswith('Signed by the Governor'): atype = 'governor:signed' elif desc == 'Vetoed by the Governor': atype = 'governor:vetoed' elif desc == 'Read first time': atype = ['bill:introduced', 'bill:reading:1'] introduced = True elif desc == 'Read & adopted': atype = ['bill:passed'] if not introduced: introduced = True atype.append('bill:introduced') elif desc == "Passed as amended": atype = 'bill:passed' elif desc.startswith('Referred to') or desc.startswith( "Recommended to be sent to "): atype = 'committee:referred' elif desc == "Reported favorably w/o amendment(s)": atype = 'committee:passed' elif desc == "Filed": atype = 'bill:filed' elif desc == 'Read 3rd time': atype = 'bill:reading:3' elif desc == 'Read 2nd time': atype = 'bill:reading:2' elif desc.startswith('Reported favorably'): atype = 'committee:passed:favorable' else: atype = 'other' if 'committee:referred' in atype: repls = ['Referred to', "Recommended to be sent to "] ctty = desc for r in repls: ctty = ctty.replace(r, "").strip() extra['committee'] = ctty bill.add_action(actor, action.findtext('description'), act_date, type=atype, **extra) for author in root.findtext('authors').split(' | '): if author != "": bill.add_sponsor('author', author) for coauthor in root.findtext('coauthors').split(' | '): if coauthor != "": bill.add_sponsor('coauthor', coauthor) for sponsor in root.findtext('sponsors').split(' | '): if sponsor != "": bill.add_sponsor('sponsor', sponsor) for cosponsor in root.findtext('cosponsors').split(' | '): if cosponsor != "": bill.add_sponsor('cosponsor', cosponsor) bill['subjects'] = [] for subject in root.iterfind('subjects/subject'): bill['subjects'].append(subject.text.strip()) return bill
def scrape_bill_sheet(self, session, chamber): """ Scrape the bill sheet (the page full of bills and other small bits of data) """ sheet_url = self.get_bill_folder(session, chamber) bill_chamber = {"Senate": "upper", "House": "lower"}[chamber] index = { "id": 0, "title_sponsor": 1, "version": 2, "history": 3, "votes": 7 } with self.urlopen(sheet_url) as sheet_html: sheet_page = lxml.html.fromstring(sheet_html) bills = sheet_page.xpath('//table/tr') for bill in bills: bill_id = self.read_td(bill[index["id"]][0]) if bill_id == None: # Every other entry is null for some reason continue dot_loc = bill_id.find('.') if dot_loc != -1: # budget bills are missing the .pdf, don't truncate bill_id = bill_id[:dot_loc] title_and_sponsor = bill[index["title_sponsor"]][0] bill_title = title_and_sponsor.text bill_title_and_sponsor = title_and_sponsor.text_content() sponsors = bill_title_and_sponsor.replace(bill_title, "").\ replace(" & ...", "").split("--") cats = { "SB": "bill", "HB": "bill", "HR": "resolution", "SR": "resolution", "SCR": "concurrent resolution", "HCR": "concurrent resolution", "SJR": "joint resolution", "HJR": "joint resolution", "SM": "memorial", "HM": "memorial" } bill_type = None for cat in cats: if bill_id[:len(cat)] == cat: bill_type = cats[cat] b = Bill(session, bill_chamber, bill_id, bill_title, type=bill_type) b.add_source(sheet_url) versions_url = \ bill[index["version"]].xpath('font/a')[0].attrib["href"] versions_url = CO_URL_BASE + versions_url versions = self.parse_versions(versions_url) for version in versions: b.add_version(version['name'], version['link'], mimetype=version['mimetype']) bill_history_href = CO_URL_BASE + \ bill[index["history"]][0][0].attrib['href'] # ^^^^^^^ We assume this is a full path to the target. # might want to consider some better rel-path support # XXX: Look at this ^ history = self.parse_history(bill_history_href) b.add_source(bill_history_href) for action in history: self.add_action_to_bill(b, action) for sponsor in sponsors: if sponsor != None and sponsor != "(NONE)" and \ sponsor != "": b.add_sponsor("primary", sponsor) # Now that we have history, let's see if we can't grab some # votes bill_vote_href = self.get_vote_url(bill_id, session) votes = self.parse_votes(bill_vote_href) if votes['sanity-check'] != bill_id: self.warning("XXX: READ ME! Sanity check failed!") self.warning(" -> Scraped ID: " + votes['sanity-check']) self.warning(" -> 'Real' ID: " + bill_id) assert votes['sanity-check'] == bill_id for vote in votes['votes']: filed_votes = vote['votes'] passage = vote['meta'] result = vote['result'] composite_time = "%s %s" % (passage['x-parent-date'], passage['TIME']) # It's now like: 04/01/2011 02:10:14 PM pydate = dt.datetime.strptime(composite_time, "%m/%d/%Y %I:%M:%S %p") hasHouse = "House" in passage['x-parent-ctty'] hasSenate = "Senate" in passage['x-parent-ctty'] if hasHouse and hasSenate: actor = "joint" elif hasHouse: actor = "lower" else: actor = "upper" other = (int(result['EXC']) + int(result['ABS'])) # OK, sometimes the Other count is wrong. local_other = 0 for voter in filed_votes: l_vote = filed_votes[voter].lower().strip() if l_vote != "yes" and l_vote != "no": local_other = local_other + 1 if local_other != other: self.warning( \ "XXX: !!!WARNING!!! 
- resetting the 'OTHER' VOTES" ) self.warning(" -> Old: %s // New: %s" % (other, local_other)) other = local_other v = Vote(actor, pydate, passage['MOTION'], (result['FINAL_ACTION'] == "PASS"), int(result['YES']), int(result['NO']), other, moved=passage['MOVED'], seconded=passage['SECONDED']) v.add_source(vote['meta']['url']) # v.add_source( bill_vote_href ) # XXX: Add more stuff to kwargs, we have a ton of data for voter in filed_votes: who = voter vote = filed_votes[who] if vote.lower() == "yes": v.yes(who) elif vote.lower() == "no": v.no(who) else: v.other(who) b.add_vote(v) self.save_bill(b)