def process_page(self):
    chamber = "upper" if self.input.identifier.startswith("S") else "lower"
    short_title = self.get_column_div("Summary").text
    long_title = CSS("#title").match_one(self.root).text

    if "*" in self.input.identifier:
        stars = re.search(r"\*+", self.input.identifier).group()
        if (
            self.input.session in CARRYOVERS
            and stars in CARRYOVERS[self.input.session]
        ):
            self.input.identifier = re.sub(
                r"\*+",
                "-" + CARRYOVERS[self.input.session][stars],
                self.input.identifier,
            )
        else:
            self.logger.error(
                f"Unidentified carryover bill {self.input.identifier}. "
                "Update CARRYOVERS dict in bills.py"
            )
            return

    bill = Bill(
        identifier=self.input.identifier,
        legislative_session=self.input.session,
        title=short_title,
        chamber=chamber,
    )
    bill.subject = self.input.subjects
    # use the pretty source URL
    bill.add_source(self.input.source_url)
    bill.add_title(long_title)

    try:
        sponsors = self.get_column_div("Primary Sponsor")
        self.add_sponsors(bill, CSS("a").match(sponsors), primary=True)
    except SelectorError:
        pass
    try:
        cosponsors = self.get_column_div("Co-Sponsor")
        self.add_sponsors(bill, CSS("a").match(cosponsors), primary=False)
    except SelectorError:
        pass
    # TODO: figure out cosponsor div name, can't find any as of Feb 2021

    self.add_actions(bill, chamber)

    bdr = extract_bdr(short_title)
    if bdr:
        bill.extras["BDR"] = bdr

    text_url = self.source.url.replace("Overview", "Text")
    yield BillTabText(bill, source=text_url)
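# A minimal sketch of how the CARRYOVERS rewrite above behaves. The mapping
# contents here are illustrative assumptions, not Nevada's real table: the
# scraper rewrites a trailing run of stars into a session-specific suffix.
import re

CARRYOVERS_DEMO = {"2021": {"*": "32"}}  # hypothetical session/suffix pair

identifier, session = "SB1*", "2021"
stars = re.search(r"\*+", identifier).group()
if session in CARRYOVERS_DEMO and stars in CARRYOVERS_DEMO[session]:
    identifier = re.sub(r"\*+", "-" + CARRYOVERS_DEMO[session][stars], identifier)
assert identifier == "SB1-32"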
def test_whitespace_is_stripped():
    s = Scraper(juris, "/tmp/")
    b = Bill(" HB 11", "2020", " a short title ")
    b.subject = [" one", "two ", " three "]
    b.add_source("https://example.com/ ")

    s.save_object(b)

    # the simple cases, and nested lists / objects
    assert b.identifier == "HB 11"
    assert b.title == "a short title"
    assert b.sources[0]["url"] == "https://example.com/"
    assert b.subject == ["one", "two", "three"]
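# A rough sketch of the recursive stripping the test above exercises. The
# function name and structure are assumptions for illustration, not the
# scraper framework's actual save_object implementation.
def strip_whitespace(value):
    if isinstance(value, str):
        return value.strip()
    if isinstance(value, list):
        return [strip_whitespace(v) for v in value]
    if isinstance(value, dict):
        return {k: strip_whitespace(v) for k, v in value.items()}
    return value

assert strip_whitespace({"url": "https://example.com/ "}) == {"url": "https://example.com/"}
assert strip_whitespace([" one", "two "]) == ["one", "two"]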
def scrape_bill(self, session, session_slug, chamber, url):
    page = lxml.html.fromstring(self.get(url).text)
    bill_no = page.xpath('//*[@id="item-header"]/text()')[0].strip()
    # state bill id
    internal_id = re.search(r"\/Bill\/(\d+)\/Overview", url).group(1)

    # bill data gets filled in from another call
    bill_data_base = (
        "https://www.leg.state.nv.us/App/NELIS/REL/{}/Bill/"
        "FillSelectedBillTab?selectedTab=Overview&billKey={}&_={}"
    )
    bill_data_url = bill_data_base.format(session_slug, internal_id, time.time() * 1000)

    bill_page = lxml.html.fromstring(self.get(bill_data_url).text)

    short_title = self.get_header_field(bill_page, "Summary:").text
    short_title = short_title.replace("\u00a0", " ")

    bill = Bill(
        identifier=bill_no,
        legislative_session=session,
        title=short_title,
        chamber=chamber,
    )

    long_title = self.get_header_field(bill_page, "Title:").text
    if long_title is not None:
        bill.add_abstract(long_title, "Summary")

    sponsor_div = self.get_header_field(bill_page, "Primary Sponsor")
    if sponsor_div is not None:
        self.add_sponsors(sponsor_div, bill, "primary")

    cosponsor_div = self.get_header_field(bill_page, "Co-Sponsor")
    if cosponsor_div is not None:
        self.add_sponsors(cosponsor_div, bill, "cosponsor")

    self.add_actions(bill_page, bill, chamber)
    self.add_versions(session_slug, internal_id, bill)

    bill.subject = list(set(self.subject_mapping[bill_no]))

    bdr = self.extract_bdr(short_title)
    if bdr:
        bill.extras["BDR"] = bdr

    bill.extras["NV_ID"] = internal_id

    bill.add_source(url)
    yield bill
def handle_page(self):
    bills = self.doc.xpath('//ul[@class="linkSect"]/li')
    for item in bills:
        link = item.getchildren()[0]
        bill_id = str(link.text_content())

        if not bill_id.startswith(("S", "H")):
            continue

        # create a bill
        desc = item.xpath("text()")[0].strip()
        chamber = {"H": "lower", "S": "upper"}[bill_id[0]]
        bill_type = {
            "B": "bill",
            "J": "joint resolution",
            "R": "resolution",
        }[bill_id[1]]
        bill = Bill(
            bill_id,
            self.kwargs["session"],
            desc,
            chamber=chamber,
            classification=bill_type,
        )

        bill_url = link.get("href")
        sponsor_url = BASE_URL + URL_PATTERNS["sponsors"].format(
            self.kwargs["session_id"], bill_id.replace(" ", "")
        )

        list(self.scrape_page_items(BillSponsorPage, url=sponsor_url, obj=bill))
        yield from self.scrape_page_items(BillDetailPage, url=bill_url, obj=bill)
        bill.subject = self.kwargs["subjects"][bill_id]
        bill.add_source(bill_url)
        yield bill

    next_url = self.doc.xpath('//a/b[text()="More..."]/../@href')
    if next_url:
        yield from self.scrape_page_items(BillListPage, url=next_url[0], **self.kwargs)
def process_item(self, item):
    bill_id = item.text.strip()
    title = item.xpath("string(../following-sibling::td[1])").strip()
    sponsor = item.xpath("string(../following-sibling::td[2])").strip()
    bill_url = item.attrib["href"] + "/ByCategory"

    if bill_id.startswith(("SB ", "HB ", "SPB ", "HPB ")):
        bill_type = "bill"
    elif bill_id.startswith(("HR ", "SR ")):
        bill_type = "resolution"
    elif bill_id.startswith(("HJR ", "SJR ")):
        bill_type = "joint resolution"
    elif bill_id.startswith(("SCR ", "HCR ")):
        bill_type = "concurrent resolution"
    elif bill_id.startswith(("SM ", "HM ")):
        bill_type = "memorial"
    else:
        raise ValueError("Failed to identify bill type.")

    bill = Bill(
        bill_id,
        self.input["session"],
        title,
        chamber="lower" if bill_id[0] == "H" else "upper",
        classification=bill_type,
    )
    bill.add_source(bill_url)

    # normalize id from HB 0004 to H4
    subj_bill_id = re.sub(r"(H|S)\w+ 0*(\d+)", r"\1\2", bill_id)
    bill.subject = list(self.subjects[subj_bill_id])

    sponsor = re.sub(r"^(?:Rep|Sen)\.\s", "", sponsor)
    sponsor = re.sub(r",\s+(Jr|Sr)\.", r" \1.", sponsor)
    for sp in sponsor.split(", "):
        sp = sp.strip()
        sp_type = "organization" if "committee" in sp.lower() else "person"
        bill.add_sponsorship(sp, "primary", sp_type, True)

    return BillDetail(bill)
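# Worked examples of the ID normalization above (HB 0004 -> H4); the inputs
# are illustrative. The regex keeps the chamber letter and the digits with
# leading zeros dropped.
import re

for raw, expected in [("HB 0004", "H4"), ("SJR 0012", "S12")]:
    assert re.sub(r"(H|S)\w+ 0*(\d+)", r"\1\2", raw) == expected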
def handle_list_item(self, item):
    bill_id = item.text.strip()
    title = item.xpath("string(../following-sibling::td[1])").strip()
    sponsor = item.xpath("string(../following-sibling::td[2])").strip()
    bill_url = item.attrib["href"] + "/ByCategory"

    if bill_id.startswith(("SB ", "HB ", "SPB ", "HPB ")):
        bill_type = "bill"
    elif bill_id.startswith(("HR ", "SR ")):
        bill_type = "resolution"
    elif bill_id.startswith(("HJR ", "SJR ")):
        bill_type = "joint resolution"
    elif bill_id.startswith(("SCR ", "HCR ")):
        bill_type = "concurrent resolution"
    elif bill_id.startswith(("SM ", "HM ")):
        bill_type = "memorial"
    else:
        raise ValueError("Failed to identify bill type.")

    bill = Bill(
        bill_id,
        self.kwargs["session"],
        title,
        chamber="lower" if bill_id[0] == "H" else "upper",
        classification=bill_type,
    )
    bill.add_source(bill_url)

    # normalize id from HB 0004 to H4
    subj_bill_id = re.sub(r"(H|S)\w+ 0*(\d+)", r"\1\2", bill_id)
    bill.subject = list(self.kwargs["subjects"][subj_bill_id])

    sponsor = re.sub(r"^(?:Rep|Sen)\.\s", "", sponsor)
    for sp in sponsor.split(", "):
        sp = sp.strip()
        bill.add_sponsorship(sp, "primary", "person", True)

    yield from self.scrape_page_items(BillDetail, url=bill_url, obj=bill)
    yield bill
def scrape_bill_list(self, chamber, session, url):
    if "joint_resolution" in url:
        bill_type = "joint resolution"
    elif "resolution" in url:
        bill_type = "resolution"
    elif "bill" in url:
        bill_type = "bill"

    try:
        data = self.get(url).text
    except scrapelib.HTTPError:
        self.warning("skipping URL %s" % url)
        return
    doc = lxml.html.fromstring(data)
    doc.make_links_absolute(url)

    bill_list = doc.xpath('//ul[@class="infoLinks"]/li/div[@class="row-fluid"]')
    for b in bill_list:
        bill_url = b.xpath('./div[@class="span3"]/a/@href')[0]
        bill_id = bill_url.rsplit("/", 1)[-1]
        bill_id = bill_id.upper()

        title = (
            b.xpath('./div[@class="span6"]/text()')[0]
            .replace(" - Relating to: ", "")
            .strip()
        )

        bill = Bill(
            bill_id,
            legislative_session=session,
            title=title,
            chamber=chamber,
            classification=bill_type,
        )
        bill.subject = list(set(self.subjects[bill_id]))
        yield from self.scrape_bill_history(bill, bill_url, chamber)
        yield bill
def _parse_senate_billpage(self, bill_url, year):
    bill_page = self.lxmlize(bill_url)

    # get all the info needed to record the bill
    # TODO probably still needs to be fixed
    bill_id = bill_page.xpath('//*[@id="lblBillNum"]')[0].text_content()
    bill_title = bill_page.xpath('//*[@id="lblBillTitle"]')[0].text_content()
    bill_desc = bill_page.xpath('//*[@id="lblBriefDesc"]')[0].text_content()
    # bill_lr = bill_page.xpath('//*[@id="lblLRNum"]')[0].text_content()

    bill_type = "bill"
    triplet = bill_id[:3]
    if triplet in bill_types:
        bill_type = bill_types[triplet]

    subs = []
    bid = bill_id.replace(" ", "")

    if bid in self._subjects:
        subs = self._subjects[bid]
        self.info("With subjects for this bill")
        self.info(bid)

    if bid == "XXXXXX":
        self.info("Skipping Junk Bill")
        return

    bill = Bill(
        bill_id,
        title=bill_desc,
        chamber="upper",
        legislative_session=self._session_id,
        classification=bill_type,
    )
    bill.subject = subs
    bill.add_abstract(bill_desc, note="abstract")
    bill.add_source(bill_url)

    if bill_title:
        bill.add_title(bill_title)

    # Get the primary sponsor
    try:
        sponsor = bill_page.xpath('//a[@id="hlSponsor"]')[0]
    except IndexError:
        sponsor = bill_page.xpath('//span[@id="lSponsor"]')[0]

    bill_sponsor = sponsor.text_content()
    # bill_sponsor_link = sponsor.attrib.get('href')
    bill.add_sponsorship(
        bill_sponsor, entity_type="person", classification="primary", primary=True
    )

    # cosponsors show up on their own page, if they exist
    cosponsor_tag = bill_page.xpath('//a[@id="hlCoSponsors"]')
    if len(cosponsor_tag) > 0 and cosponsor_tag[0].attrib.get("href"):
        self._parse_senate_cosponsors(bill, cosponsor_tag[0].attrib["href"])

    # get the actions
    action_url = bill_page.xpath('//a[@id="hlAllActions"]')
    if len(action_url) > 0:
        action_url = action_url[0].attrib["href"]
        self._parse_senate_actions(bill, action_url)

    # stored on a separate page
    versions_url = bill_page.xpath('//a[@id="hlFullBillText"]')
    if len(versions_url) > 0 and versions_url[0].attrib.get("href"):
        self._parse_senate_bill_versions(bill, versions_url[0].attrib["href"])

    amendment_links = bill_page.xpath('//a[contains(@href,"ShowAmendment.asp")]')
    for link in amendment_links:
        link_text = link.xpath("string(.)").strip()
        if "adopted" in link_text.lower():
            link_url = link.xpath("@href")[0]
            bill.add_version_link(
                link_text,
                link_url,
                media_type="application/pdf",
                on_duplicate="ignore",
            )

    yield bill
def scrape_bills(self, chamber, session, subjects):
    idex = bill_start_numbers(session)[chamber]
    FROM = "ctl00$rilinContent$txtBillFrom"
    TO = "ctl00$rilinContent$txtBillTo"
    YEAR = "ctl00$rilinContent$cbYear"

    blocks = "FOO"  # Ugh.
    while len(blocks) > 0:
        default_headers = get_default_headers(SEARCH_URL)
        default_headers[FROM] = idex
        default_headers[TO] = idex + MAXQUERY
        default_headers[YEAR] = session
        idex += MAXQUERY

        blocks = self.parse_results_page(
            self.post(SEARCH_URL, data=default_headers).text
        )
        blocks = blocks[1:-1]
        blocks = self.digest_results_page(blocks)

        for block in blocks:
            bill = blocks[block]
            title = bill["title"][len("ENTITLED, "):]
            billid = bill["bill_id"]
            try:
                subs = subjects[bill["bill_id"]]
            except KeyError:
                subs = []

            for prefix in BILL_NAME_TRANSLATIONS:
                if billid[:len(prefix)] == prefix:
                    billid = (
                        BILL_NAME_TRANSLATIONS[prefix]
                        + billid[len(prefix) + 1:].split()[0]
                    )

            b = Bill(
                billid,
                title=title,
                chamber=chamber,
                legislative_session=session,
                classification=self.get_type_by_name(bill["bill_id"]),
            )
            b.subject = subs

            # keep bill ID around
            self._bill_id_by_type[(chamber, re.findall(r"\d+", billid)[0])] = billid

            self.process_actions(bill["actions"], b)

            sponsors = bill["sponsors"][len("BY"):].strip()
            sponsors = sponsors.split(",")
            sponsors = [s.strip() for s in sponsors]

            for href in bill["bill_id_hrefs"]:
                b.add_version_link(
                    href.text, href.attrib["href"], media_type="application/pdf"
                )

            for sponsor in sponsors:
                b.add_sponsorship(
                    sponsor,
                    entity_type="person",
                    classification="primary",
                    primary=True,
                )

            b.add_source(SEARCH_URL)
            yield b
def scrape_bill_list(self, url):
    bill_list = self._get_bill_list(url)

    for bill_info in bill_list:
        (bill_id,) = bill_info.xpath("td[1]/font/input/@value")
        (sponsor,) = bill_info.xpath("td[2]/font/input/@value")
        (subject,) = bill_info.xpath("td[3]//text()")
        subject = subject.strip()
        chamber = self.CHAMBERS[bill_id[0]]

        if "B" in bill_id:
            bill_type = "bill"
        elif "JR" in bill_id:
            bill_type = "joint resolution"
        elif "R" in bill_id:
            bill_type = "resolution"
        else:
            raise AssertionError("Unknown bill type for bill '{}'".format(bill_id))

        bill = Bill(
            bill_id,
            legislative_session=self.session,
            chamber=chamber,
            title="",
            classification=bill_type,
        )
        if subject:
            bill.subject = [subject]
        if sponsor:
            bill.add_sponsorship(
                name=sponsor,
                entity_type="person",
                classification="primary",
                primary=True,
            )
        bill.add_source(url)

        bill_url = (
            "http://alisondb.legislature.state.al.us/Alison/"
            "SESSBillStatusResult.aspx?BILL={}".format(bill_id)
        )
        bill.add_source(bill_url)

        bill_html = self._get_bill_response(bill_url)
        if bill_html is None:
            self.warning("Bill {} has no webpage, and will be skipped".format(bill_id))
            continue
        bill_doc = lxml.html.fromstring(bill_html)

        # initialize so the fallback below also covers a missing title span
        title = ""
        if bill_doc.xpath('//span[@id="ContentPlaceHolder1_lblShotTitle"]'):
            title = (
                bill_doc.xpath('//span[@id="ContentPlaceHolder1_lblShotTitle"]')[0]
                .text_content()
                .strip()
            )
        if not title:
            title = "[No title given by state]"
        bill.title = title

        session = "2022FS" if self.session == "2022s1" else self.session
        version_url_base = (
            "http://alisondb.legislature.state.al.us/ALISON/"
            "SearchableInstruments/{0}/PrintFiles/{1}-".format(session, bill_id)
        )
        versions = bill_doc.xpath(
            '//table[@class="box_versions"]/tr/td[2]/font/text()'
        )
        for version in versions:
            name = version
            if version == "Introduced":
                version_url = version_url_base + "int.pdf"
            elif version == "Engrossed":
                version_url = version_url_base + "eng.pdf"
            elif version == "Enrolled":
                version_url = version_url_base + "enr.pdf"
            else:
                raise NotImplementedError(
                    "Unknown version type found: '{}'".format(name)
                )

            bill.add_version_link(
                name,
                version_url,
                media_type="application/pdf",
                on_duplicate="ignore",
            )

        # Fiscal notes exist, but I can't figure out how to build their URL
        fiscal_notes = bill_doc.xpath('//table[@class="box_fiscalnote"]')[1:]
        for fiscal_note in fiscal_notes:
            pass

        # Budget Isolation Resolutions are handled as extra actions/votes
        birs = bill_doc.xpath('//div[@class="box_bir"]//table//table/tr')[1:]
        for bir in birs:
            bir_action = bir.xpath("td[1]")[0].text_content().strip()
            # Sometimes ALISON's database puts another bill's
            # actions into the BIR action list; ignore these
            if bill_id not in bir_action:
                self.warning(
                    "BIR action found ({}) ".format(bir_action)
                    + "that doesn't match the bill ID ({})".format(bill_id)
                )
                continue

            bir_date = datetime.datetime.strptime(
                bir.xpath("td[2]/font/text()")[0], self.DATE_FORMAT
            )
            bir_type = bir.xpath("td[1]/font/text()")[0].split(" ")[0]
            bir_chamber = self.CHAMBERS[bir_type[0]]
            bir_text = "{0}: {1}".format(
                bir_type, bir.xpath("td[3]/font/text()")[0].strip()
            )

            bill.add_action(
                bir_text,
                TIMEZONE.localize(bir_date),
                chamber=bir_chamber,
                classification="other",
            )

            try:
                (bir_vote_id,) = bir.xpath("td[4]/font/input/@value")
            except ValueError:
                bir_vote_id = ""

            bir_vote_id = bir_vote_id.strip()
            if bir_vote_id.startswith("Roll "):
                bir_vote_id = bir_vote_id.split(" ")[-1]
                yield from self.scrape_vote(
                    bill=bill,
                    vote_chamber=bir_type[0],
                    bill_id="{0}%20for%20{1}".format(bir_type, bill_id),
                    vote_id=bir_vote_id,
                    vote_date=TIMEZONE.localize(bir_date),
                    action_text=bir_text,
                )

        actions = bill_doc.xpath('//table[@id="ContentPlaceHolder1_gvHistory"]/tr')[1:]
        action_date = None
        for action in actions:
            # If actions occur on the same day, only one date will exist
            if action.xpath("td[1]/font/text()")[0].encode("ascii", "ignore").strip():
                action_date = datetime.datetime.strptime(
                    action.xpath("td[1]/font/text()")[0], self.DATE_FORMAT
                )

            (action_chamber,) = action.xpath("td[2]/font/text()")

            possible_amendment = action.xpath("td[3]/font/u/text()")
            if len(possible_amendment) > 0 and not possible_amendment[0].strip() == "":
                (amendment,) = possible_amendment
            else:
                amendment = None

            (action_text,) = action.xpath("td[4]/font/text()")
            action_type = _categorize_action(action_text)

            # check for occasional extra last row
            if not action_chamber.strip():
                continue

            # The committee cell is just an abbreviation, so get its name
            actor = self.CHAMBERS[action_chamber]
            try:
                action_committee = (
                    re.search(
                        r".*? referred to the .*? committee on (.*?)$", action_text
                    )
                    .group(1)
                    .strip()
                )
            except AttributeError:
                action_committee = ""

            if action_date is not None and action_text.strip():
                act = bill.add_action(
                    action_text,
                    TIMEZONE.localize(action_date),
                    chamber=actor,
                    classification=action_type,
                )

                if action_committee:
                    act.add_related_entity(action_committee, entity_type="organization")

                try:
                    vote_button = action.xpath("td[9]//text()")[0].strip()
                except IndexError:
                    vote_button = ""

                if vote_button.startswith("Roll "):
                    vote_id = vote_button.split(" ")[-1]
                    yield from self.scrape_vote(
                        bill=bill,
                        vote_chamber=action_chamber,
                        bill_id=bill_id,
                        vote_id=vote_id,
                        vote_date=TIMEZONE.localize(action_date),
                        action_text=action_text,
                    )

            if amendment:
                session = "2021FS" if self.session == "2021s1" else self.session
                amend_url = (
                    "http://alisondb.legislature.state.al.us/ALISON/"
                    "SearchableInstruments/{0}/PrintFiles/{1}.pdf".format(
                        session, amendment
                    )
                )
                amend_name = "Amd/Sub {}".format(amendment)
                bill.add_version_link(
                    amend_name,
                    amend_url,
                    media_type="application/pdf",
                    on_duplicate="ignore",
                )

        yield bill
def scrape_bill(self, chamber, session, bill_id, url):
    page = self.lxmlize(url)

    (header,) = page.xpath('//h3[@class="heading"]/text()')
    title = header.replace(bill_id, "").strip()

    if ".B. " in bill_id:
        bill_type = "bill"
    elif bill_id.startswith("H.R. ") or bill_id.startswith("S.R. "):
        bill_type = "resolution"
    elif ".C.R. " in bill_id:
        bill_type = "concurrent resolution"
    elif ".J.R. " in bill_id:
        bill_type = "joint resolution"

    for flag in SUB_BLACKLIST:
        if flag in bill_id:
            bill_id = bill_id.replace(flag, " ")
    bill_id = re.sub(r"\s+", " ", bill_id).strip().replace(".", "")

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=title,
        classification=bill_type,
    )
    bill.add_source(url)

    primary_info = page.xpath('//div[@id="billsponsordiv"]')
    for info in primary_info:
        try:
            (title, name) = [x.strip() for x in info.xpath(".//text()") if x.strip()]
        except ValueError:
            self.warning("Could not find sponsor's name for {}".format(bill_id))
            continue
        assert title == "Bill Sponsor:"
        name = name.replace("Sen. ", "").replace("Rep. ", "")
        bill.add_sponsorship(
            name, classification="primary", entity_type="person", primary=True
        )

    floor_info = page.xpath('//div[@id="floorsponsordiv"]//text()')
    floor_info = [x.strip() for x in floor_info if x.strip()]
    if len(floor_info) in (0, 1):
        # This indicates that no floor sponsor was found
        pass
    elif len(floor_info) == 2:
        assert floor_info[0] == "Floor Sponsor:"
        floor_sponsor = floor_info[1].replace("Sen. ", "").replace("Rep. ", "")
        bill.add_sponsorship(
            floor_sponsor,
            classification="cosponsor",
            entity_type="person",
            primary=False,
        )
    else:
        self.warning("Unexpected floor sponsor HTML found")

    versions = page.xpath(
        '//b[text()="Bill Text"]/following-sibling::ul/li/'
        'a[text() and not(text()=" ")]'
    )
    for version in versions:
        # sometimes the href is on the following <a> tag and the tag we
        # have has an onclick
        url = version.get("href")
        if not url:
            url = version.xpath("following-sibling::a[1]/@href")[0]
        bill.add_version_link(
            version.xpath("text()")[0].strip(), url, media_type="application/pdf"
        )

    for related in page.xpath(
        '//b[text()="Related Documents "]/following-sibling::ul/li/'
        'a[contains(@class,"nlink")]'
    ):
        href = related.xpath("@href")[0]
        if ".fn.pdf" in href:
            bill.add_document_link("Fiscal Note", href, media_type="application/pdf")
        else:
            text = related.xpath("text()")[0]
            bill.add_document_link(text, href, media_type="application/pdf")

    subjects = []
    for link in page.xpath("//a[contains(@href, 'RelatedBill')]"):
        subjects.append(link.text.strip())
    bill.subject = subjects

    if page.xpath('//div[@id="billStatus"]//table'):
        status_table = page.xpath('//div[@id="billStatus"]//table')[0]
        yield from self.parse_status(bill, status_table, chamber)

    yield bill
def scrape_bill(self, chamber, session, bill_id):
    bill_num = bill_id.split()[1]

    url = "%s/GetLegislation?biennium=%s&billNumber=%s" % (
        self._base_url,
        self.biennium,
        bill_num,
    )

    page = self.get(url)
    page = lxml.etree.fromstring(page.content)
    page = xpath(page, "//wa:Legislation")[0]

    xml_chamber = xpath(page, "string(wa:OriginalAgency)")
    chamber = self._chamber_map[xml_chamber]

    title = xpath(page, "string(wa:LongDescription)")

    bill_type = xpath(page, "string(wa:ShortLegislationType/wa:LongLegislationType)")
    bill_type = bill_type.lower()

    if bill_type == "gubernatorial appointment":
        return

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=title,
        classification=[bill_type],
    )

    fake_source = "http://apps.leg.wa.gov/billinfo/summary.aspx?bill=%s&year=%s" % (
        bill_num,
        session[0:4],
    )
    bill.add_source(fake_source)

    try:
        for version in self.versions[bill_id]:
            bill.add_version_link(
                note=version["note"],
                url=version["url"],
                media_type=version["media_type"],
            )
    except KeyError:
        self.warning("No versions were found for {}".format(bill_id))

    try:
        for document in self.documents[bill_num]:
            bill.add_document_link(
                note=document["note"],
                url=document["url"],
                media_type=document["media_type"],
            )
    except KeyError:
        pass

    self.scrape_sponsors(bill)
    self.scrape_actions(bill, chamber, fake_source)
    self.scrape_hearings(bill, bill_num)
    yield from self.scrape_votes(bill)
    bill.subject = list(set(self._subjects[bill_id]))
    yield bill
def test_full_bill():
    create_jurisdiction()
    person = Person.objects.create(name="Adam Smith")
    lower = Organization.objects.create(
        jurisdiction_id="jid", name="House", classification="lower"
    )
    Membership.objects.create(person_id=person.id, organization_id=lower.id)
    Organization.objects.create(
        jurisdiction_id="jid",
        name="Arbitrary Committee",
        classification="committee",
        parent=lower,
    )

    oldbill = ScrapeBill(
        "HB 99",
        "1899",
        "Axe & Tack Tax Act",
        classification="tax bill",
        chamber="lower",
    )

    bill = ScrapeBill(
        "HB 1",
        "1900",
        "Axe & Tack Tax Act",
        classification="tax bill",
        chamber="lower",
    )
    bill.subject = ["taxes", "axes"]
    bill.add_identifier("SB 9")
    bill.add_title("Tack & Axe Tax Act")
    bill.add_action("introduced in house", "1900-04-01", chamber="lower")
    act = bill.add_action("sent to arbitrary committee", "1900-04-04", chamber="lower")
    act.add_related_entity(
        "arbitrary committee",
        "organization",
        _make_pseudo_id(name="Arbitrary Committee"),
    )
    bill.add_related_bill(
        "HB 99", legislative_session="1899", relation_type="prior-session"
    )
    bill.add_sponsorship(
        "Adam Smith",
        classification="extra sponsor",
        entity_type="person",
        primary=False,
        entity_id=_make_pseudo_id(name="Adam Smith"),
    )
    bill.add_sponsorship(
        "Jane Smith", classification="lead sponsor", entity_type="person", primary=True
    )
    bill.add_abstract(
        "This is an act about axes and taxes and tacks.",
        note="official",
        date="1969-10-20",
    )
    bill.add_document_link(
        "Fiscal Note", "http://example.com/fn.pdf", media_type="application/pdf"
    )
    bill.add_document_link(
        "Fiscal Note", "http://example.com/fn.html", media_type="text/html"
    )
    bill.add_version_link(
        "Fiscal Note", "http://example.com/v/1", media_type="text/html"
    )
    bill.add_source("http://example.com/source")

    # import bill
    BillImporter("jid").import_data([oldbill.as_dict(), bill.as_dict()])

    # get bill from db and assert it imported correctly
    b = Bill.objects.get(identifier="HB 1")
    assert b.from_organization.classification == "lower"
    assert b.identifier == bill.identifier
    assert b.title == bill.title
    assert b.classification == bill.classification
    assert b.subject == ["taxes", "axes"]
    assert b.abstracts.get().note == "official"
    assert b.abstracts.get().date == "1969-10-20"

    # other_title, other_identifier added
    assert b.other_titles.get().title == "Tack & Axe Tax Act"
    assert b.other_identifiers.get().identifier == "SB 9"

    # actions
    actions = list(b.actions.all())
    assert len(actions) == 2
    # ensure order was preserved (if this breaks it'll be intermittent)
    assert actions[0].organization == Organization.objects.get(classification="lower")
    assert actions[0].description == "introduced in house"
    assert actions[1].description == "sent to arbitrary committee"
    assert actions[1].related_entities.get().organization == Organization.objects.get(
        classification="committee"
    )

    # action computed fields
    assert b.first_action_date == "1900-04-01"
    assert b.latest_action_date == "1900-04-04"
    assert b.latest_action_description == "sent to arbitrary committee"

    # related_bills were added
    rb = b.related_bills.get()
    assert rb.identifier == "HB 99"

    # and bill got resolved
    assert rb.related_bill.identifier == "HB 99"

    # sponsors added, linked & unlinked
    sponsorships = b.sponsorships.all()
    assert len(sponsorships) == 2
    person = Person.objects.get(name="Adam Smith")
    for ss in sponsorships:
        if ss.primary:
            assert ss.person is None
            assert ss.organization is None
        else:
            assert ss.person == person

    # versions & documents with their links
    versions = b.versions.all()
    assert len(versions) == 1
    assert versions[0].links.count() == 1
    documents = b.documents.all()
    assert len(documents) == 1
    assert documents[0].links.count() == 2

    # sources
    assert b.sources.count() == 1
def scrape_bill(
    self,
    session,
    chamber,
    bill_id,
    title,
    url,
    strip_sponsors=re.compile(r"\s*\(.{,50}\)\s*").sub,
):
    html = self.get(url).text
    page = lxml.html.fromstring(html)
    page.make_links_absolute(url)

    bill_type = self.bill_types[bill_id.split()[0][1:]]

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=title,
        classification=bill_type,
    )
    bill.add_source(url)

    xpath = '//strong[contains(., "SUBJECT")]/../following-sibling::td/a/text()'
    bill.subject = page.xpath(xpath)

    for version in self.scrape_versions(session, chamber, page, bill_id):
        bill.add_version_link(**version)

    self.scrape_amendments(page, bill)

    # Resolution pages have different html.
    values = {}
    trs = page.xpath('//div[@id="bhistcontent"]/table/tr')
    for tr in trs:
        heading = tr.xpath("td/strong/text()")
        if heading:
            heading = heading[0]
        else:
            continue
        value = tr.text_content().replace(heading, "").strip()
        values[heading] = value

    # summary was always same as title
    # bill['summary'] = values['SUMMARY:']

    # Add primary sponsor.
    primary = strip_sponsors("", values.get("LEAD SPONSOR:", ""))
    if primary:
        bill.add_sponsorship(
            name=primary,
            classification="primary",
            entity_type="person",
            primary=True,
        )

    # Add cosponsors.
    if values.get("SPONSORS:"):
        sponsors = strip_sponsors("", values["SPONSORS:"])
        sponsors = re.split(r", (?![A-Z]\.)", sponsors)
        for name in sponsors:
            name = name.strip(", \n\r")
            if name:
                # Fix name splitting bug where "Neale, D. Hall"
                match = re.search(r"(.+?), ([DM]\. Hall)", name)
                if match:
                    for name in match.groups():
                        bill.add_sponsorship(
                            name=name,
                            classification="cosponsor",
                            entity_type="person",
                            primary=False,
                        )
                else:
                    bill.add_sponsorship(
                        name=name,
                        classification="cosponsor",
                        entity_type="person",
                        primary=False,
                    )

    for link in page.xpath("//a[contains(@href, 'votes/house')]"):
        yield from self.scrape_house_vote(bill, link.attrib["href"])

    for tr in reversed(page.xpath("//table[@class='tabborder']/descendant::tr")[1:]):
        tds = tr.xpath("td")
        if len(tds) < 3:
            continue

        chamber_letter = tds[0].text_content()
        chamber = {"S": "upper", "H": "lower"}[chamber_letter]

        # Index of date info no longer varies on resolutions.
        date = tds[2].text_content().strip()
        date = datetime.datetime.strptime(date, "%m/%d/%y").date()

        action = tds[1].text_content().strip()
        if action.lower().startswith("passed senate"):
            for href in tds[1].xpath("a/@href"):
                yield from self.scrape_senate_vote(bill, href, date)

        attrs = dict(
            chamber=chamber, description=action, date=date.strftime("%Y-%m-%d")
        )
        temp = self.categorizer.categorize(action)
        related_entities = []
        for key, values in temp.items():
            if key != "classification":
                for value in values:
                    related_entities.append({"type": key, "name": value})
        attrs.update(
            classification=temp["classification"],
            related_entities=related_entities,
        )
        bill.add_action(**attrs)

    yield bill
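# A quick check of the cosponsor-splitting regex above: the (?![A-Z]\.)
# negative lookahead keeps an initial like "D. Hall" attached to the
# preceding surname instead of splitting on its comma. Names are made up.
import re

sponsors = re.split(r", (?![A-Z]\.)", "Smith, Jones, Neale, D. Hall")
assert sponsors == ["Smith", "Jones", "Neale, D. Hall"]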
def _parse_house_bill(self, url, session):
    # using the print page makes the page simpler, and also *drastically* smaller
    # (8k rather than 100k)
    url = re.sub("billsummary", "billsummaryprn", url)
    url = "%s/%s" % (self._house_base_url, url)

    # the URL is an iframed version now, so swap in for the actual bill page
    url = url.replace("Bill.aspx", "BillContent.aspx")
    url = url.replace("&code=R", "&code=R&style=new")

    # http://www.house.mo.gov/Bill.aspx?bill=HB26&year=2017&code=R
    # http://www.house.mo.gov/BillContent.aspx?bill=HB26&year=2017&code=R&style=new

    bill_page = self.get(url).text
    bill_page = lxml.html.fromstring(bill_page)
    bill_page.make_links_absolute(url)

    bill_id = bill_page.xpath('//*[@class="entry-title"]/div')
    if len(bill_id) == 0:
        self.info("WARNING: bill summary page is blank! (%s)" % url)
        self._bad_urls.append(url)
        return
    bill_id = bill_id[0].text_content()
    bill_id = clean_text(bill_id)

    bill_desc = bill_page.xpath('//*[@class="BillDescription"]')[0].text_content()
    bill_desc = clean_text(bill_desc)

    table_rows = bill_page.xpath("//table/tr")
    # if there is a cosponsor all the rows are pushed down one for the extra row
    # for the cosponsor:
    cosponsor_offset = 0
    if table_rows[2][0].text_content().strip() == "Co-Sponsor:":
        cosponsor_offset = 1

    lr_label_tag = table_rows[3 + cosponsor_offset]
    assert lr_label_tag[0].text_content().strip() == "LR Number:"
    # bill_lr = lr_label_tag[1].text_content()

    last_action_offset = 0
    if table_rows[4 + cosponsor_offset][0].text_content().strip() == "Governor Action:":
        last_action_offset = 1

    official_title_tag = table_rows[5 + cosponsor_offset + last_action_offset]
    assert official_title_tag[0].text_content().strip() == "Bill String:"
    official_title = official_title_tag[1].text_content()

    # could substitute the description for the name,
    # but keeping it separate for now.
    bill_type = "bill"
    triplet = bill_id[:3]
    if triplet in bill_types:
        bill_type = bill_types[triplet]
        bill_number = int(bill_id[3:].strip())
    else:
        bill_number = int(bill_id[3:])

    subs = []
    bid = bill_id.replace(" ", "")

    if bid in self._subjects:
        subs = self._subjects[bid]
        self.info("With subjects for this bill")
        self.info(bid)

    if bill_desc == "":
        if bill_number <= 20:
            # blank bill titles early in session are approp. bills
            bill_desc = "Appropriations Bill"
        else:
            self.error(
                "Blank title. Skipping. {} / {} / {}".format(
                    bill_id, bill_desc, official_title
                )
            )
            return

    bill = Bill(
        bill_id,
        chamber="lower",
        title=bill_desc,
        legislative_session=self._session_id,
        classification=bill_type,
    )
    bill.subject = subs
    bill.add_title(official_title, note="official")
    bill.add_source(url)

    bill_sponsor = clean_text(table_rows[0][1].text_content())
    # try:
    #     bill_sponsor_link = table_rows[0][1][0].attrib['href']
    # except IndexError:
    #     return
    bill.add_sponsorship(
        bill_sponsor, entity_type="person", classification="primary", primary=True
    )

    # check for cosponsors
    (sponsors_url,) = bill_page.xpath("//a[contains(@href, 'CoSponsors.aspx')]/@href")
    self._parse_cosponsors_from_bill(bill, sponsors_url)

    # actions_link_tag = bill_page.xpath('//div[@class="Sections"]/a')[0]
    # actions_link = '%s/%s' % (self._house_base_url, actions_link_tag.attrib['href'])
    # actions_link = re.sub("content", "print", actions_link)
    (actions_link,) = bill_page.xpath("//a[contains(@href, 'BillActions.aspx')]/@href")
    yield from self._parse_house_actions(bill, actions_link)

    # get bill documents
    doc_tags = bill_page.xpath('//div[@class="BillDocuments"][1]/span')
    for doc_tag in reversed(doc_tags):
        doc = clean_text(doc_tag.text_content())
        text_url = "%s%s" % (self._house_base_url, doc_tag[0].attrib["href"])
        bill.add_document_link(doc, text_url, media_type="text/html")

    # get bill versions
    version_tags = bill_page.xpath('//div[@class="BillDocuments"][2]/span')
    for version_tag in reversed(version_tags):
        version = clean_text(version_tag.text_content())
        for vurl in version_tag.xpath(".//a"):
            if vurl.text == "PDF":
                mimetype = "application/pdf"
            else:
                mimetype = "text/html"
            bill.add_version_link(
                version,
                vurl.attrib["href"],
                media_type=mimetype,
                on_duplicate="ignore",
            )

    # house bill versions
    # everything between the row containing "Bill Text" in an h2 and the next div.DocHeaderRow
    version_rows = bill_page.xpath(
        '//div[h2[contains(text(),"Bill Text")]]/'
        'following-sibling::div[contains(@class,"DocRow") '
        'and count(preceding-sibling::div[contains(@class,"DocHeaderRow")])=1]'
    )
    for row in version_rows:
        # some rows are just broken links, not real versions
        if row.xpath('.//div[contains(@class,"textType")]/a/@href'):
            version = row.xpath('.//div[contains(@class,"textType")]/a/text()')[0].strip()
            path = row.xpath('.//div[contains(@class,"textType")]/a/@href')[0].strip()
            if ".pdf" in path:
                mimetype = "application/pdf"
            else:
                mimetype = "text/html"
            bill.add_version_link(
                version, path, media_type=mimetype, on_duplicate="ignore"
            )

    # house bill summaries
    # everything between the row containing "Bill Summary" in an h2
    # and the next div.DocHeaderRow
    summary_rows = bill_page.xpath(
        '//div[h2[contains(text(),"Bill Summary")]]/'
        'following-sibling::div[contains(@class,"DocRow") '
        'and count(following-sibling::div[contains(@class,"DocHeaderRow")])=1]'
    )
    # if there are no amendments, we need a different xpath for summaries
    if not summary_rows:
        summary_rows = bill_page.xpath(
            '//div[h2[contains(text(),"Bill Summary")]]/'
            'following-sibling::div[contains(@class,"DocRow")]'
        )
    for row in reversed(summary_rows):
        version = row.xpath('.//div[contains(@class,"textType")]/a/text()')[0].strip()
        if version:
            path = row.xpath('.//div[contains(@class,"textType")]/a/@href')[0].strip()
            summary_name = "Bill Summary ({})".format(version)
            if ".pdf" in path:
                mimetype = "application/pdf"
            else:
                mimetype = "text/html"
            bill.add_document_link(
                summary_name, path, media_type=mimetype, on_duplicate="ignore"
            )

    # house bill amendments
    amendment_rows = bill_page.xpath(
        '//div[h2[contains(text(),"Amendment")]]/'
        'following-sibling::div[contains(@class,"DocRow")]'
    )
    for row in reversed(amendment_rows):
        version = row.xpath('.//div[contains(@class,"DocInfoCell")]/a[1]/text()')[0].strip()
        path = row.xpath('.//div[contains(@class,"DocInfoCell")]/a[1]/@href')[0].strip()
        summary_name = "Amendment {}".format(version)

        defeated_icon = row.xpath('.//img[contains(@title,"Defeated")]')
        if defeated_icon:
            summary_name = "{} (Defeated)".format(summary_name)

        adopted_icon = row.xpath('.//img[contains(@title,"Adopted")]')
        if adopted_icon:
            summary_name = "{} (Adopted)".format(summary_name)

        distributed_icon = row.xpath('.//img[contains(@title,"Distributed")]')
        if distributed_icon:
            summary_name = "{} (Distributed)".format(summary_name)

        if ".pdf" in path:
            mimetype = "application/pdf"
        else:
            mimetype = "text/html"
        bill.add_version_link(
            summary_name, path, media_type=mimetype, on_duplicate="ignore"
        )

    yield bill
def scrape_bill(self, chamber, session, bill_id, url):
    try:
        page = lxml.html.fromstring(self.get(url).text)
    except scrapelib.HTTPError as e:
        self.warning("error (%s) fetching %s, skipping" % (e, url))
        return

    title = page.xpath("string(//span[contains(@id, 'PlaceHolder1_txtST')])").strip()

    if not title:
        self.warning("blank bill on %s - skipping", url)
        return

    if "JR" in bill_id:
        bill_type = ["joint resolution"]
    elif "CR" in bill_id:
        bill_type = ["concurrent resolution"]
    elif "R" in bill_id:
        bill_type = ["resolution"]
    else:
        bill_type = ["bill"]

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=title,
        classification=bill_type,
    )
    bill.add_source(url)
    bill.subject = self.subject_map[bill_id]

    for link in page.xpath("//a[contains(@id, 'Auth')]"):
        name = link.xpath("string()").strip()

        if "author not found" in name.lower():
            continue

        if ":" in name:
            raise Exception(name)

        if "otherAuth" in link.attrib["id"]:
            bill.add_sponsorship(
                name,
                classification="cosponsor",
                entity_type="person",
                primary=False,
            )
        else:
            bill.add_sponsorship(
                name, classification="primary", entity_type="person", primary=True
            )

    act_table = page.xpath("//table[contains(@id, 'Actions')]")[0]
    for tr in act_table.xpath("tr")[2:]:
        action = tr.xpath("string(td[1])").strip()
        if not action or action == "None":
            continue

        date = tr.xpath("string(td[3])").strip()
        date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

        actor = tr.xpath("string(td[4])").strip()
        if actor == "H":
            actor = "lower"
        elif actor == "S":
            actor = "upper"

        attrs = self.categorizer.categorize(action)
        related_entities = []
        for item in attrs["committees"]:
            related_entities.append({"type": "committee", "name": item})
        for item in attrs["legislators"]:
            related_entities.append({"type": "legislator", "name": item})
        bill.add_action(
            description=action,
            date=date.strftime("%Y-%m-%d"),
            chamber=actor,
            classification=attrs["classification"],
            related_entities=related_entities,
        )

    version_table = page.xpath("//table[contains(@id, 'Versions')]")[0]
    # Keep track of already seen versions to prevent processing duplicates.
    version_urls = []
    for link in version_table.xpath(".//a[contains(@href, '.PDF')]"):
        version_url = link.attrib["href"]
        if version_url in version_urls:
            self.warning("Skipping duplicate version URL.")
            continue
        else:
            version_urls.append(version_url)
        name = link.text.strip()

        if re.search("COMMITTEE REPORTS|SCHEDULED CCR", version_url, re.IGNORECASE):
            bill.add_document_link(
                note=name, url=version_url, media_type="application/pdf"
            )
            continue

        bill.add_version_link(note=name, url=version_url, media_type="application/pdf")

    self.scrape_amendments(bill, page)

    for link in page.xpath(".//a[contains(@href, '_VOTES')]"):
        if "HT_" not in link.attrib["href"]:
            yield from self.scrape_votes(bill, self.urlescape(link.attrib["href"]))

    # If the bill has no actions and no versions, it's a bogus bill on
    # their website, which appears to happen occasionally. Skip.
    has_no_title = bill.title == "Short Title Not Found."
    if has_no_title:
        # If there's no title, this is an empty page. Skip!
        return
    else:
        # Otherwise, save the bills.
        yield bill
def scrape_bill(self, chamber, session, bill_id):
    # try and get bill for the first year of the session biennium
    url = "http://legislature.mi.gov/doc.aspx?%s-%s" % (
        session[:4],
        bill_id.replace(" ", "-"),
    )
    html = self.get(url).text
    # Otherwise, try second year of the session biennium
    if (
        "Page Not Found" in html
        or "The bill you are looking for is not available yet" in html
    ):
        url = "http://legislature.mi.gov/doc.aspx?%s-%s" % (
            session[-4:],
            bill_id.replace(" ", "-"),
        )
        html = self.get(url).text
        if (
            "Page Not Found" in html
            or "The bill you are looking for is not available yet" in html
        ):
            self.warning("Cannot open bill page for {}; skipping".format(bill_id))
            return

    doc = lxml.html.fromstring(html)
    doc.make_links_absolute("http://legislature.mi.gov")

    title = doc.xpath('//span[@id="frg_billstatus_ObjectSubject"]')[0].text_content()

    # get B/R/JR/CR part and look up bill type
    bill_type = bill_types[bill_id.split(" ")[0][1:]]

    bill = Bill(bill_id, session, title, chamber=chamber, classification=bill_type)
    bill.add_source(url)

    # sponsors
    sponsors = doc.xpath('//span[@id="frg_billstatus_SponsorList"]/a')
    for sponsor in sponsors:
        name = sponsor.text.replace(u"\xa0", " ")
        # sometimes district gets added as a link
        if name.isnumeric():
            continue

        if len(sponsors) > 1:
            classification = (
                "primary"
                if sponsor.tail and "primary" in sponsor.tail
                else "cosponsor"
            )
        else:
            classification = "primary"
        bill.add_sponsorship(
            name=name.strip(),
            chamber=chamber,
            entity_type="person",
            primary=classification == "primary",
            classification=classification,
        )

    bill.subject = doc.xpath('//span[@id="frg_billstatus_CategoryList"]/a/text()')

    # actions (skip header)
    for row in doc.xpath('//table[@id="frg_billstatus_HistoriesGridView"]/tr')[1:]:
        # date, journal link, action
        tds = row.xpath("td")
        date = tds[0].text_content()
        journal = tds[1].text_content()
        action = tds[2].text_content()

        try:
            date = TIMEZONE.localize(datetime.datetime.strptime(date, "%m/%d/%y"))
        except ValueError:
            try:
                date = TIMEZONE.localize(datetime.datetime.strptime(date, "%m/%d/%Y"))
            except ValueError:
                self.warning(
                    "{} has action with invalid date. Skipping Action".format(bill_id)
                )
                continue

        # use journal for actor, then fall back to upper/lower case.
        # Journal entries are often posted with 'Expected Soon' as the cite,
        # then changed to the journal entry.
        if "SJ" in journal.upper():
            actor = "upper"
        elif "HJ" in journal.upper():
            actor = "lower"
        elif action.split()[0].islower():
            actor = "lower"
        elif action.split()[0].isupper():
            actor = "upper"
        else:
            actor = "legislature"

        classification = categorize_action(action)
        bill.add_action(action, date, chamber=actor, classification=classification)

        # check if action mentions a sub
        submatch = re.search(r"WITH SUBSTITUTE\s+([\w\-\d]+)", action, re.IGNORECASE)
        if submatch and tds[2].xpath("a"):
            version_url = tds[2].xpath("a/@href")[0]
            version_name = tds[2].xpath("a/text()")[0].strip()
            version_name = "Substitute {}".format(version_name)
            self.info("Found Substitute {}".format(version_url))
            if version_url.lower().endswith(".pdf"):
                mimetype = "application/pdf"
            elif version_url.lower().endswith(".htm"):
                mimetype = "text/html"
            else:
                # fall back rather than crash on an unexpected extension
                mimetype = "text/html"
            bill.add_version_link(version_name, version_url, media_type=mimetype)

        # check if action mentions a vote
        rcmatch = re.search(r"Roll Call # (\d+)", action, re.IGNORECASE)
        if rcmatch:
            rc_num = rcmatch.groups()[0]
            # in format mileg.aspx?page=getobject&objectname=2011-SJ-02-10-011
            journal_link = tds[1].xpath("a/@href")
            if journal_link:
                objectname = journal_link[0].rsplit("=", 1)[-1]
                chamber_name = {"upper": "Senate", "lower": "House"}[actor]
                vote_url = BASE_URL + "/documents/%s/Journal/%s/htm/%s.htm" % (
                    session,
                    chamber_name,
                    objectname,
                )
                results = self.parse_roll_call(vote_url, rc_num, session)

                if results is not None:
                    vote_passed = len(results["yes"]) > len(results["no"])
                    vote = VoteEvent(
                        start_date=date,
                        chamber=actor,
                        bill=bill,
                        motion_text=action,
                        result="pass" if vote_passed else "fail",
                        classification="passage",
                    )

                    # check the expected counts vs actual
                    count = re.search(r"YEAS (\d+)", action, re.IGNORECASE)
                    count = int(count.groups()[0]) if count else 0
                    if count != len(results["yes"]):
                        self.warning(
                            "vote count mismatch for %s %s, %d != %d"
                            % (bill_id, action, count, len(results["yes"]))
                        )
                    count = re.search(r"NAYS (\d+)", action, re.IGNORECASE)
                    count = int(count.groups()[0]) if count else 0
                    if count != len(results["no"]):
                        self.warning(
                            "vote count mismatch for %s %s, %d != %d"
                            % (bill_id, action, count, len(results["no"]))
                        )

                    vote.set_count("yes", len(results["yes"]))
                    vote.set_count("no", len(results["no"]))
                    vote.set_count("other", len(results["other"]))

                    possible_vote_results = ["yes", "no", "other"]
                    for pvr in possible_vote_results:
                        for name in results[pvr]:
                            if session == "2017-2018":
                                names = name.split("\t")
                                for n in names:
                                    # vote each tab-separated name, not the raw cell
                                    vote.vote(pvr, n.strip())
                            else:
                                # Prevents voter names like
                                # "House Bill No. 4451, entitled" and other sentences
                                if len(name.split()) < 5:
                                    vote.vote(pvr, name.strip())
                    vote.add_source(vote_url)
                    yield vote
            else:
                self.warning("missing journal link for %s %s" % (bill_id, journal))

    # versions
    for row in doc.xpath('//table[@id="frg_billstatus_DocumentGridTable"]/tr'):
        parsed = self.parse_doc_row(row)
        if parsed:
            name, url = parsed
            if url.endswith(".pdf"):
                mimetype = "application/pdf"
            elif url.endswith(".htm"):
                mimetype = "text/html"
            else:
                # fall back rather than reuse a stale mimetype from a prior row
                mimetype = "text/html"
            bill.add_version_link(name, url, media_type=mimetype)

    # documents
    for row in doc.xpath('//table[@id="frg_billstatus_HlaTable"]/tr'):
        document = self.parse_doc_row(row)
        if document:
            name, url = document
            bill.add_document_link(name, url)
    for row in doc.xpath('//table[@id="frg_billstatus_SfaTable"]/tr'):
        document = self.parse_doc_row(row)
        if document:
            name, url = document
            bill.add_document_link(name, url)

    yield bill
def parse_bill(self, chamber, session, bill_id, url):
    try:
        page = self.lxmlize(url)
    except scrapelib.HTTPError as e:
        self.logger.warning(e)
        return

    withdrawn = False

    if self.parse_bill_field(page, "Last Action") != "":
        last_action = self.parse_bill_field(page, "Last Action").xpath("text()")[0]
        if "WITHDRAWN" in last_action.upper():
            self.info("{} Withdrawn, skipping".format(bill_id))
            withdrawn = True

    if withdrawn:
        title = "Withdrawn."
    else:
        title = self.parse_bill_field(page, "Title").text_content()

    if "CR" in bill_id:
        bill_type = "concurrent resolution"
    elif "JR" in bill_id:
        bill_type = "joint resolution"
    elif "R" in bill_id:
        bill_type = "resolution"
    else:
        bill_type = "bill"

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=title,
        classification=bill_type,
    )
    bill.subject = self._subjects[bill_id]
    bill.add_source(url)

    self.parse_versions(page, bill)
    self.parse_actions(page, bill, chamber)
    self.parse_subjects(page, bill)
    self.parse_proposed_amendments(page, bill)

    # LM is "Locally Mandated fiscal impact"
    fiscal_notes = page.xpath('//a[contains(@href, "/LM.pdf")]')
    for fiscal_note in fiscal_notes:
        source_url = fiscal_note.attrib["href"]
        mimetype = get_media_type(source_url)
        bill.add_document_link("Fiscal Note", source_url, media_type=mimetype)

    # only grab links in the first table, because proposed amendments
    # have sponsors that are not bill sponsors.
    for link in page.xpath(
        "//div[contains(@class,'bill-table')][1]//td/span/a[contains(@href, 'Legislator-Profile')]"
    ):
        bill.add_sponsorship(
            link.text.strip(),
            classification="primary",
            entity_type="person",
            primary=True,
        )

    if page.xpath("//th[contains(text(),'Votes')]"):
        vote_url = page.xpath("//a[contains(text(),'Vote History')]/@href")[0]
        yield from self.scrape_votes(vote_url, bill, chamber)

    bdr_no = self.parse_bill_field(page, "Bill Request Number")
    if bdr_no != "" and bdr_no.xpath("text()"):
        bdr = bdr_no.xpath("text()")[0].strip()
        bill.extras["BDR"] = bdr

    if self.parse_bill_field(page, "Summary of Original Version") != "":
        summary = (
            self.parse_bill_field(page, "Summary of Original Version")
            .text_content()
            .strip()
        )
        bill.add_abstract(summary, note="Summary of Original Version")

    if withdrawn:
        action = self.parse_bill_field(page, "Last Action").text_content().strip()
        wd_date = re.findall(r"\d{2}\/\d{2}\/\d+", action)[0]
        wd_date = dateutil.parser.parse(wd_date).date()
        bill.add_action(action, wd_date, chamber=chamber, classification="withdrawal")

    yield bill
def scrape_bill_type(
    self,
    chamber,
    session,
    bill_type,
    type_abbr,
    committee_abbr_regex=get_committee_name_regex(),
):
    bills = (
        self.session.query(CABill)
        .filter_by(session_year=session)
        .filter_by(measure_type=type_abbr)
    )

    archive_year = int(session[0:4])
    not_archive_year = archive_year >= 2009

    for bill in bills:
        bill_session = session
        if bill.session_num != "0":
            bill_session += " Special Session %s" % bill.session_num

        bill_id = bill.short_bill_id
        if bill_id.strip() == "SB77" and session == "20052006":
            continue

        fsbill = Bill(bill_id, bill_session, title="", chamber=chamber)
        if (bill_id.startswith("S") and chamber == "lower") or (
            bill_id.startswith("A") and chamber == "upper"
        ):
            self.warning("!!!! BAD ID/CHAMBER PAIR !!!! %s" % bill)
            continue

        # Construct a fake source url
        source_url = (
            "http://leginfo.legislature.ca.gov/faces/billNavClient.xhtml?bill_id=%s"
        ) % bill.bill_id
        fsbill.add_source(source_url)
        fsbill.add_version_link(bill_id, source_url, media_type="text/html")

        title = ""
        type_ = ["bill"]
        subject = ""
        all_titles = set()
        summary = ""

        # Get digest text (aka "summary") from latest version.
        if bill.versions and not_archive_year:
            version = bill.versions[-1]
            nsmap = version.xml.nsmap
            xpath = "//caml:DigestText/xhtml:p"
            els = version.xml.xpath(xpath, namespaces=nsmap)
            chunks = []
            for el in els:
                t = etree_text_content(el)
                t = re.sub(r"\s+", " ", t)
                t = re.sub(r"\)(\S)", lambda m: ") %s" % m.group(1), t)
                chunks.append(t)
            summary = "\n\n".join(chunks)

        for version in bill.versions:
            if not version.bill_xml:
                continue

            version_date = self._tz.localize(version.bill_version_action_date)

            # create a version name to match the state's format
            # 02/06/17 - Enrolled
            version_date_human = version_date.strftime("%m/%d/%y")
            version_name = "{} - {}".format(
                version_date_human, version.bill_version_action
            )

            version_base = "https://leginfo.legislature.ca.gov/faces"
            version_url_pdf = "{}/billPdf.xhtml?bill_id={}&version={}".format(
                version_base, version.bill_id, version.bill_version_id
            )

            fsbill.add_version_link(
                version_name,
                version_url_pdf,
                media_type="application/pdf",
                date=version_date.date(),
            )

            # CA is inconsistent in that some bills have a short title
            # that is longer, more descriptive than title.
            if bill.measure_type in ("AB", "SB"):
                impact_clause = clean_title(version.title)
                title = clean_title(version.short_title)
            else:
                impact_clause = None
                if len(version.title) < len(
                    version.short_title
                ) and not version.title.lower().startswith("an act"):
                    title = clean_title(version.short_title)
                else:
                    title = clean_title(version.title)

            if title:
                all_titles.add(title)

            type_ = [bill_type]
            if version.appropriation == "Yes":
                type_.append("appropriation")

            tags = []
            if version.fiscal_committee == "Yes":
                tags.append("fiscal committee")
            if version.local_program == "Yes":
                tags.append("local program")
            if version.urgency == "Yes":
                tags.append("urgency")
            if version.taxlevy == "Yes":
                tags.append("tax levy")

            if version.subject:
                subject = clean_title(version.subject)

        if not title:
            self.warning("Couldn't find title for %s, skipping" % bill_id)
            continue

        fsbill.title = title
        if summary:
            fsbill.add_abstract(summary, note="summary")
        fsbill.classification = type_
        fsbill.subject = [subject] if subject else []
        fsbill.extras["impact_clause"] = impact_clause
        fsbill.extras["tags"] = tags

        # We don't want the current title in alternate_titles
        all_titles.remove(title)

        for title in all_titles:
            fsbill.add_title(title)

        for author in version.authors:
            fsbill.add_sponsorship(
                author.name,
                classification=SPONSOR_TYPES[author.contribution],
                primary=author.primary_author_flg == "Y",
                entity_type="person",
            )
            # fsbill.sponsorships[-1]['extras'] = {'official_type': author.contribution}

        seen_actions = set()
        for action in bill.actions:
            if not action.action:
                # NULL action text seems to be an error on CA's part,
                # unless it has some meaning I'm missing
                continue

            actor = action.actor or chamber
            actor = actor.strip()
            match = re.match(r"(Assembly|Senate)($| \(Floor)", actor)
            if match:
                actor = {"Assembly": "lower", "Senate": "upper"}[match.group(1)]
            elif actor.startswith("Governor"):
                actor = "executive"
            else:

                def replacer(matchobj):
                    if matchobj:
                        return {"Assembly": "lower", "Senate": "upper"}[
                            matchobj.group()
                        ]
                    else:
                        return matchobj.group()

                actor = re.sub(r"^(Assembly|Senate)", replacer, actor)

            type_ = []

            act_str = action.action
            act_str = re.sub(r"\s+", " ", act_str)

            attrs = self.categorizer.categorize(act_str)

            # Add in the committee strings of the related committees, if any.
            kwargs = attrs
            matched_abbrs = committee_abbr_regex.findall(action.action)

            if re.search(r"Com[s]?. on", action.action) and not matched_abbrs:
                msg = "Failed to extract committee abbr from %r."
                self.logger.warning(msg % action.action)

            if matched_abbrs:
                committees = []
                for abbr in matched_abbrs:
                    try:
                        name = self.committee_abbr_to_name(chamber, abbr)
                        committees.append(name)
                    except KeyError:
                        msg = (
                            "Mapping contains no committee name for "
                            "abbreviation %r. Action text was %r."
                        )
                        args = (abbr, action.action)
                        self.warning(msg % args)

                # materialize the filter; a lazy filter object would be
                # exhausted by the len() check below, leaving zip() empty
                committees = list(filter(None, committees))
                kwargs["committees"] = committees

                code = re.search(r"C[SXZ]\d+", actor)
                if code is not None:
                    code = code.group()
                    kwargs["actor_info"] = {"committee_code": code}

                if not_archive_year:
                    assert len(committees) == len(matched_abbrs)
                for committee, abbr in zip(committees, matched_abbrs):
                    act_str = act_str.replace("Coms. on ", "")
                    act_str = act_str.replace("Com. on " + abbr, committee)
                    act_str = act_str.replace(abbr, committee)
                    if not act_str.endswith("."):
                        act_str = act_str + "."

            # Determine which chamber the action originated from.
            changed = False
            for committee_chamber in ["upper", "lower", "legislature"]:
                if actor.startswith(committee_chamber):
                    actor = committee_chamber
                    changed = True
                    break
            if not changed:
                actor = "legislature"

            if actor != action.actor:
                actor_info = kwargs.get("actor_info", {})
                actor_info["details"] = action.actor
                kwargs["actor_info"] = actor_info

            # Add strings for related legislators, if any.
            rgx = r"(?:senator|assembly[mwp][^ .,:;]+)\s+[^ .,:;]+"
            legislators = re.findall(rgx, action.action, re.I)
            if legislators:
                kwargs["legislators"] = legislators

            date = action.action_date
            date = self._tz.localize(date)
            date = date.date()
            if (actor, act_str, date) in seen_actions:
                continue

            kwargs.update(self.categorizer.categorize(act_str))

            action = fsbill.add_action(
                act_str,
                date.strftime("%Y-%m-%d"),
                chamber=actor,
                classification=kwargs["classification"],
            )
            for committee in kwargs.get("committees", []):
                action.add_related_entity(committee, entity_type="organization")
            seen_actions.add((actor, act_str, date))

        source_url = "http://leginfo.legislature.ca.gov/faces/billVotesClient.xhtml?"
        source_url += f"bill_id={session}{bill.session_num}{fsbill.identifier}"

        # Votes for non archived years
        if archive_year > 2009:
            for vote_num, vote in enumerate(bill.votes):
                if vote.vote_result == "(PASS)":
                    result = True
                else:
                    result = False

                if not vote.location:
                    continue
                full_loc = vote.location.description
                first_part = full_loc.split(" ")[0].lower()
                if first_part in ["asm", "assembly"]:
                    vote_chamber = "lower"
                    # vote_location = ' '.join(full_loc.split(' ')[1:])
                elif first_part.startswith("sen"):
                    vote_chamber = "upper"
                    # vote_location = ' '.join(full_loc.split(' ')[1:])
                else:
                    # raise ScrapeError("Bad location: %s" % full_loc)  # To uncomment
                    continue

                if vote.motion:
                    motion = vote.motion.motion_text or ""
                else:
                    motion = ""

                if "Third Reading" in motion or "3rd Reading" in motion:
                    vtype = "passage"
                elif "Do Pass" in motion:
                    vtype = "passage"
                else:
                    vtype = "other"

                motion = motion.strip()
                motion = re.compile(
                    r"(\w+)( Extraordinary)? Session$", re.IGNORECASE
                ).sub("", motion)
                motion = re.compile(r"^(Senate|Assembly) ", re.IGNORECASE).sub(
                    "", motion
                )
                motion = re.sub(r"^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? ", "", motion)
                motion = re.sub(r" \(\w+\)$", "", motion)
                motion = re.sub(r"(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$", "", motion)
                motion = re.sub(
                    r"(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? Urgency Clause$",
                    "(Urgency Clause)",
                    motion,
                )
                motion = re.sub(r"\s+", " ", motion)

                if not motion:
                    self.warning("Got blank motion on vote for %s" % bill_id)
                    continue

                # XXX this is responsible for all the CA 'committee' votes, not
                # sure if that's a feature or bug, so I'm leaving it as is...
                # vote_classification = chamber if (vote_location == 'Floor') else 'committee'
                # org = {
                #     'name': vote_location,
                #     'classification': vote_classification
                # }

                fsvote = VoteEvent(
                    motion_text=motion,
                    start_date=self._tz.localize(vote.vote_date_time),
                    result="pass" if result else "fail",
                    classification=vtype,
                    # organization=org,
                    chamber=vote_chamber,
                    bill=fsbill,
                )
                fsvote.extras = {"threshold": vote.threshold}
                fsvote.add_source(source_url)
                fsvote.pupa_id = source_url + "#" + str(vote_num)

                rc = {"yes": [], "no": [], "other": []}
                for record in vote.votes:
                    if record.vote_code == "AYE":
                        rc["yes"].append(record.legislator_name)
                    elif record.vote_code.startswith("NO"):
                        rc["no"].append(record.legislator_name)
                    else:
                        rc["other"].append(record.legislator_name)

                # Handle duplicate votes
                for key in rc.keys():
                    rc[key] = list(set(rc[key]))

                for key, voters in rc.items():
                    for voter in voters:
                        fsvote.vote(key, voter)
                    # Set counts by summed votes for accuracy
                    fsvote.set_count(key, len(voters))

                yield fsvote

        # Votes for archived years
        if len(bill.votes) > 0 and archive_year <= 2009:
            vote_page_url = (
                "http://leginfo.legislature.ca.gov/faces/billVotesClient.xhtml?"
            )
            vote_page_url += f"bill_id={session}{bill.session_num}{fsbill.identifier}"

            # parse the bill data page, finding the latest html text
            data = self.get(vote_page_url).content
            doc = html.fromstring(data)
            doc.make_links_absolute(vote_page_url)

            num_of_votes = len(doc.xpath("//div[@class='status']"))
            for vote_section in range(1, num_of_votes + 1):
                lines = doc.xpath(
                    f"//div[@class='status'][{vote_section}]//div[@class='statusRow']"
                )
                date, result, motion, vtype, location = "", "", "", "", ""
                votes = {}
                for line in lines:
                    line = line.text_content().split()
                    if line[0] == "Date":
                        date = line[1]
                        date = datetime.datetime.strptime(date, "%m/%d/%y")
                        date = self._tz.localize(date)
                    elif line[0] == "Result":
                        result = "pass" if "PASS" in line[1] else "fail"
                    elif line[0] == "Motion":
                        motion = " ".join(line[1:])
                    elif line[0] == "Location":
                        location = " ".join(line[1:])
                    elif len(line) > 1:
                        if line[0] == "Ayes" and line[1] != "Count":
                            votes["yes"] = line[1:]
                        elif line[0] == "Noes" and line[1] != "Count":
                            votes["no"] = line[1:]
                        elif line[0] == "NVR" and line[1] != "Count":
                            votes["not voting"] = line[1:]

                # Determine chamber based on location
                first_part = location.split(" ")[0].lower()
                vote_chamber = ""
                if first_part in ["asm", "assembly"]:
                    vote_chamber = "lower"
                elif first_part.startswith("sen"):
                    vote_chamber = "upper"

                if "Third Reading" in motion or "3rd Reading" in motion:
                    vtype = "passage"
                elif "Do Pass" in motion:
                    vtype = "passage"
                else:
                    vtype = "other"

                if len(motion) > 0:
                    fsvote = VoteEvent(
                        motion_text=motion,
                        start_date=date,
                        result=result,
                        classification=vtype,
                        chamber=vote_chamber,
                        bill=fsbill,
                    )
                    fsvote.add_source(vote_page_url)
                    fsvote.pupa_id = vote_page_url + "#" + str(vote_section)

                    for how_voted, voters in votes.items():
                        for voter in voters:
                            voter = voter.replace(",", "")
                            fsvote.vote(how_voted, voter)
                    yield fsvote

        yield fsbill

    self.session.expire_all()
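# A small check of two steps in the motion-text cleanup chain above: the
# leading chamber name and the "<bill id> <author>" prefix are stripped.
# The raw motion string is invented for illustration.
import re

motion = "Assembly AB 123 Smith Third Reading"
motion = re.compile(r"^(Senate|Assembly) ", re.IGNORECASE).sub("", motion)
motion = re.sub(r"^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? ", "", motion)
assert motion == "Third Reading"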
def parse_bill(self, chamber, session, bill_id, url):
    try:
        page = self.lxmlize(url)
    except scrapelib.HTTPError as e:
        self.logger.warning(e)
        return

    if self.parse_bill_field(page, "Last Action") != "":
        last_action = self.parse_bill_field(page, "Last Action").xpath("text()")[0]
        if "WITHDRAWN" in last_action.upper():
            self.info("{} Withdrawn, skipping".format(bill_id))
            return

    title = self.parse_bill_field(page, "Title").text_content()

    if "CR" in bill_id:
        bill_type = "concurrent resolution"
    elif "JR" in bill_id:
        bill_type = "joint resolution"
    elif "R" in bill_id:
        bill_type = "resolution"
    else:
        bill_type = "bill"

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=title,
        classification=bill_type,
    )
    bill.subject = self._subjects[bill_id]
    bill.add_source(url)

    version_ct = self.parse_versions(page, bill)
    if version_ct < 1:
        # Bill withdrawn
        self.logger.warning("Bill withdrawn.")
        return

    self.parse_actions(page, bill, chamber)
    self.parse_subjects(page, bill)
    self.parse_proposed_amendments(page, bill)

    # LM is "Locally Mandated fiscal impact"
    fiscal_notes = page.xpath('//a[contains(@href, "/LM.pdf")]')
    for fiscal_note in fiscal_notes:
        source_url = fiscal_note.attrib["href"]
        mimetype = get_media_type(source_url)
        bill.add_document_link("Fiscal Note", source_url, media_type=mimetype)

    for link in page.xpath("//td/span/a[contains(@href, 'Legislator-Profile')]"):
        bill.add_sponsorship(
            link.text.strip(),
            classification="primary",
            entity_type="person",
            primary=True,
        )

    if page.xpath("//th[contains(text(),'Votes')]"):
        vote_url = page.xpath("//a[contains(text(),'Vote History')]/@href")[0]
        yield from self.scrape_votes(vote_url, bill, chamber)

    bdr_no = self.parse_bill_field(page, "Bill Request Number")
    if bdr_no != "" and bdr_no.xpath("text()"):
        bdr = bdr_no.xpath("text()")[0].strip()
        bill.extras["BDR"] = bdr

    yield bill
def scrape_actions(self, session, href):
    page = self.lxmlize(href)

    (bid,) = page.xpath('//h1[@id="page-title"]/text()')
    bid = re.sub(r"^Bill Actions for ", "", bid)
    subjects = self.subjects.get(bid, [])

    # some pages say "Measure Number Breakdown", others "Bill..."
    table = page.xpath("//table[contains(@summary, 'Number Breakdown')]")
    table = table[0]
    ttrows = page.xpath("//div[@id='application']/p")
    descr = ttrows[-2]

    title = re.sub(r"\s+", " ", descr.text_content()).strip()
    ttrows = ttrows[:-1]

    chamber = {"H": "lower", "S": "upper"}[bid[0]]

    type_ = bid[1:3]
    bill_type = "bill"
    if type_.startswith("B"):
        bill_type = "bill"
    if type_.startswith("R"):
        bill_type = "resolution"
    if type_ == "CR":
        bill_type = "concurrent resolution"

    bill = Bill(
        bid,
        legislative_session=session,
        chamber=chamber,
        title=title,
        classification=bill_type,
    )
    bill.subject = subjects
    bill.add_source(href)

    for row in ttrows:
        if isinstance(row, lxml.html.HtmlComment):
            continue  # ignore HTML comments, no text_content()

        sponsors = row.text_content().strip()
        sinf = re.match(
            r"(?i)introduced by( (rep\.|sen\.))? (?P<sponsors>.*)", sponsors
        )
        if sinf:
            sponsors = sinf.groupdict()
            for sponsor in [x.strip() for x in sponsors["sponsors"].split(",")]:
                bill.add_sponsorship(
                    sponsor,
                    classification="primary",
                    entity_type="person",
                    primary=True,
                )

    dt = None
    oldchamber = "other"
    for row in table.xpath(".//tr"):
        if row.text_content().strip() == "":
            continue

        if "Meeting Description" in [x.strip() for x in row.xpath(".//th/text()")]:
            continue

        row = row.xpath("./*")
        row = [x.text_content().strip() for x in row]

        if len(row) > 3:
            row = row[:3]

        date, chamber, action = row

        try:
            chamber = {"House": "lower", "Senate": "upper"}[chamber]
            oldchamber = chamber
        except KeyError:
            chamber = oldchamber

        if date != "":
            dt = datetime.strptime("%s %s" % (date, self.year), "%m/%d %Y")

        classif = self.categorizer.categorize(action)

        bill.add_action(
            chamber=chamber,
            description=action,
            date=dt.strftime("%Y-%m-%d"),
            classification=classif["classification"],
        )

    version_url = page.xpath("//a[contains(text(), 'Versions')]")
    if len(version_url) == 1:
        href = version_url[0].attrib["href"]
        bill = self.scrape_versions(bill, href)

    yield bill
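# Demo of the month/day parsing with an injected year, as used above: the
# action table only carries "%m/%d", so the scraper supplies the session
# year separately. The date value is invented for illustration.
from datetime import datetime

dt = datetime.strptime("%s %s" % ("02/14", 2021), "%m/%d %Y")
assert dt.strftime("%Y-%m-%d") == "2021-02-14"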