def scrape_votes_old(self, bill, billname, session):
    """Scrape roll-call votes for a bill from the journal vote listing page.

    The page is fetched from a URL built from ``session`` and ``billname``.
    Every link whose ``href`` contains ``JournalText`` produces one vote: the
    table row holding the link supplies the date, chamber and motion, and
    the row after it supplies the lists of yea and nay voters.

    Args:
        bill: Passed through unchanged as the ``bill`` of each ``VoteEvent``.
        billname: Bill name fragment appended to the vote page URL.
        session: Session fragment placed before ``billname`` in the URL.

    Yields:
        One ``VoteEvent`` per journal link, with every yea and nay voter
        recorded and the vote page attached as its source.

    Raises:
        ScrapeError: If the chamber label is neither "House" nor "Senate".
    """
    # NOTE(review): the URL prefix literal is empty in this file -- confirm
    # the base URL was not lost.
    vote_url = "" + session + "_" + billname
    page = lxml.html.fromstring(self.get(vote_url).text)

    chamber_by_label = {"House": "lower", "Senate": "upper"}

    def voter_names(row, marker):
        # Non-empty cell texts of the div whose id contains `marker`.
        div = row.xpath("td/font/div[contains(@id, '%s')]" % marker)[0]
        texts = (td.xpath("string()") for td in div.xpath("table/tr/td"))
        return [text for text in texts if text]

    for jlink in page.xpath("//a[contains(@href, 'JournalText')]"):
        parsed = datetime.datetime.strptime(jlink.text, "%m/%d/%Y")
        date = "{:%Y-%m-%d}".format(self._tz.localize(parsed).date())

        # Second cell of the link's row reads "<chamber> - <motion>\n...".
        details = jlink.xpath("string(../../../td[2])")
        parts = details.split(" - ")
        label = parts[0]
        if label not in chamber_by_label:
            raise ScrapeError("Bad chamber: %s" % label)
        chamber = chamber_by_label[label]
        motion = parts[1].split("\n")[0].strip()

        # The voter lists live in the row right after the link's row.
        vote_row = jlink.xpath("../../..")[0].getnext()
        yeas = voter_names(vote_row, "Yea")
        nays = voter_names(vote_row, "Nay")

        vote = VoteEvent(
            chamber=chamber,
            start_date=date,
            motion_text=motion,
            result="pass" if len(yeas) > len(nays) else "fail",
            bill=bill,
            classification="passage",
        )
        for name in yeas:
            vote.yes(name)
        # Fix: nay voters were never recorded; the loop body used to add the
        # source instead, once per nay.
        for name in nays:
            vote.no(name)
        vote.add_source(vote_url)
        yield vote
def scrape_bill(self, session, history_url):
    """Scrape one bill from its XML history document.

    Builds a ``Bill`` from the document's caption and ``bill`` attribute,
    then attaches subjects, version links, analysis and fiscal-note links,
    witness lists (from ``self.witnesses``), actions, sponsorships and, when
    the document has a non-empty ``companions`` element, companion data via
    ``self._get_companion``.

    Args:
        session: Legislative session identifier stored on the bill.
        history_url: URL of the XML bill history document.

    Yields:
        The populated ``Bill``. Nothing is yielded when the document has no
        caption or contains the text "Bill does not exist".

    Raises:
        ScrapeError: If the bill id does not indicate a known bill type.
    """
    history_xml = self.get(history_url).text
    root = etree.fromstring(history_xml)

    bill_title = root.findtext("caption")
    if bill_title is None or "Bill does not exist" in history_xml:
        self.warning("Bill does not appear to exist")
        return

    # The "bill" attribute is "<prefix> <id...>"; drop the first token.
    bill_id = " ".join(root.attrib["bill"].split(" ")[1:])
    chamber = self.CHAMBERS[bill_id[0]]

    if bill_id[1] == "B":
        bill_type = ["bill"]
    elif bill_id[1] == "R":
        bill_type = ["resolution"]
    elif bill_id[1:3] == "CR":
        bill_type = ["concurrent resolution"]
    elif bill_id[1:3] == "JR":
        bill_type = ["joint resolution"]
    else:
        raise ScrapeError("Invalid bill_id: %s" % bill_id)

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=bill_title,
        classification=bill_type,
    )
    bill.add_source(history_url)

    bill_id_for_url = bill_id.replace(" ", "")
    # NOTE(review): this source string has no scheme/host in this file --
    # confirm the URL prefix was not lost.
    bill.add_source(
        f"{session}&Bill={bill_id_for_url}"
    )

    for subject in root.iterfind("subjects/subject"):
        bill.add_subject(subject.text.strip())

    for version in root.iterfind("billtext/docTypes/bill/versions/version"):
        # Skip placeholder elements that have no children. An explicit
        # len() replaces the deprecated truth-test on an XML element.
        if len(version) == 0:
            continue
        note = version.find("versionDescription").text
        html_url = version.find("WebHTMLURL").text
        bill.add_version_link(note=note, url=html_url, media_type="text/html")
        pdf_url = version.find("WebPDFURL").text
        bill.add_version_link(
            note=note, url=pdf_url, media_type="application/pdf"
        )

    # Analyses and fiscal notes share one layout; only path and label differ.
    for doc_type, label in (("analysis", "Analysis"),
                            ("fiscalNote", "Fiscal Note")):
        path = "billtext/docTypes/%s/versions/version" % doc_type
        for doc in root.iterfind(path):
            if len(doc) == 0:
                continue
            description = doc.find("versionDescription").text
            html_url = doc.find("WebHTMLURL").text
            bill.add_document_link(
                note="{} ({})".format(label, description),
                url=html_url,
                media_type="text/html",
            )

    # Each witness entry is indexed as (bill id, url); the fifth character
    # from the end of the url keys into NAME_SLUGS.
    for witness in self.witnesses:
        if witness[0] != bill_id:
            continue
        bill.add_document_link(
            note="Witness List ({})".format(
                self.NAME_SLUGS[witness[1][-5]]),
            url=witness[1],
            media_type="text/html",
        )

    # First character of the action number identifies the acting body.
    actor_by_prefix = {"H": "lower", "S": "upper", "E": "executive"}
    for action in root.findall("actions/action"):
        act_date = datetime.datetime.strptime(
            action.findtext("date"), "%m/%d/%Y").date()
        action_number = action.find("actionNumber").text
        actor = actor_by_prefix[action_number[0]]
        desc = action.findtext("description").strip()

        if desc == "Scheduled for public hearing on . . .":
            self.warning("Skipping public hearing action with no date")
            continue

        atype = _categorize_action(desc)

        act = bill.add_action(
            action.findtext("description"),
            act_date,
            chamber=actor,
            classification=atype,
        )

        if atype and "referral-committee" in atype:
            # What is left after removing the referral phrase is treated as
            # the committee name.
            ctty = desc
            for phrase in ("Referred to", "Recommended to be sent to "):
                ctty = ctty.replace(phrase, "").strip()
            act.add_related_entity(name=ctty, entity_type="organization")

    # (element tag, sponsorship classification, primary flag)
    sponsor_fields = (
        ("authors", "primary", True),
        ("coauthors", "cosponsor", False),
        ("sponsors", "primary", True),
        ("cosponsors", "cosponsor", False),
    )
    for tag, classification, primary in sponsor_fields:
        # A missing element is treated like an empty one, not a crash.
        for name in (root.findtext(tag) or "").split(" | "):
            if name != "":
                bill.add_sponsorship(
                    name,
                    classification=classification,
                    entity_type="person",
                    primary=primary,
                )

    if root.findtext("companions"):
        self._get_companion(bill)

    yield bill
def scrape_bill(self, session, history_url):
    """Scrape one bill from its XML history document.

    Builds a ``Bill`` from the document's caption and ``bill`` attribute,
    attaches subjects, the version/analysis/fiscal-note/witness links
    previously collected on ``self``, classified actions and sponsorships,
    and calls ``self._get_companion`` when the document has a non-empty
    ``companions`` element.

    Args:
        session: Legislative session identifier stored on the bill.
        history_url: URL of the XML bill history document.

    Yields:
        The populated ``Bill``. Nothing is yielded when the document has no
        caption or contains the text "Bill does not exist".

    Raises:
        ScrapeError: If the bill id does not indicate a known bill type.
    """
    history_xml = self.get(history_url).text
    root = etree.fromstring(history_xml)

    bill_title = root.findtext("caption")
    if bill_title is None or "Bill does not exist" in history_xml:
        self.warning("Bill does not appear to exist")
        return

    # The "bill" attribute is "<prefix> <id...>"; drop the first token.
    bill_id = " ".join(root.attrib["bill"].split(" ")[1:])
    chamber = self.CHAMBERS[bill_id[0]]

    if bill_id[1] == "B":
        bill_type = ["bill"]
    elif bill_id[1] == "R":
        bill_type = ["resolution"]
    elif bill_id[1:3] == "CR":
        bill_type = ["concurrent resolution"]
    elif bill_id[1:3] == "JR":
        bill_type = ["joint resolution"]
    else:
        raise ScrapeError("Invalid bill_id: %s" % bill_id)

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=bill_title,
        classification=bill_type,
    )
    bill.add_source(history_url)

    for subject in root.iterfind("subjects/subject"):
        bill.add_subject(subject.text.strip())

    # Pre-collected entries are indexed as (bill id, url); the fifth
    # character from the end of the url keys into NAME_SLUGS.
    for version in self.versions:
        if version[0] != bill_id:
            continue
        bill.add_version_link(
            note=self.NAME_SLUGS[version[1][-5]],
            url=version[1],
            media_type="text/html",
        )

    document_sources = (
        (self.analyses, "Analysis ({})"),
        (self.fiscal_notes, "Fiscal Note ({})"),
        (self.witnesses, "Witness List ({})"),
    )
    for entries, note_format in document_sources:
        for entry in entries:
            if entry[0] != bill_id:
                continue
            bill.add_document_link(
                note=note_format.format(self.NAME_SLUGS[entry[1][-5]]),
                url=entry[1],
                media_type="text/html",
            )

    # First character of the action number identifies the acting body.
    actor_by_prefix = {"H": "lower", "S": "upper", "E": "executive"}

    # Fix: this flag used to be reset for every action, so the
    # "Read & adopted" branch always saw it as False. It now persists
    # across the bill's actions.
    introduced = False

    for action in root.findall("actions/action"):
        act_date = datetime.datetime.strptime(
            action.findtext("date"), "%m/%d/%Y").date()
        action_number = action.find("actionNumber").text
        actor = actor_by_prefix[action_number[0]]
        desc = action.findtext("description").strip()

        if desc == "Scheduled for public hearing on . . .":
            self.warning("Skipping public hearing action with no date")
            continue

        if desc == "Amended":
            atype = "amendment-passage"
        elif desc == "Amendment(s) offered":
            atype = "amendment-introduction"
        elif desc == "Amendment amended":
            atype = "amendment-amendment"
        elif desc == "Amendment withdrawn":
            atype = "amendment-withdrawal"
        elif desc in ("Passed", "Adopted"):
            atype = "passage"
        elif re.match(r"^Received (by|from) the", desc):
            if "Secretary of the Senate" not in desc:
                atype = "introduction"
            else:
                atype = "filing"
        elif desc.startswith("Sent to the Governor"):
            # But what if it gets lost in the mail?
            atype = "executive-receipt"
        elif desc.startswith("Signed by the Governor"):
            atype = "executive-signature"
        elif desc.startswith("Effective on"):
            atype = "became-law"
        elif desc == "Vetoed by the Governor":
            atype = "executive-veto"
        elif desc == "Read first time":
            atype = ["introduction", "reading-1"]
            introduced = True
        elif desc == "Read & adopted":
            atype = ["passage"]
            # Counts as the introduction only if none was seen earlier.
            if not introduced:
                introduced = True
                atype.append("introduction")
        elif desc == "Passed as amended":
            atype = "passage"
        elif desc.startswith(("Referred to", "Recommended to be sent to ")):
            atype = "referral-committee"
        elif desc == "Reported favorably w/o amendment(s)":
            # Must be tested before the generic "Reported favorably" prefix.
            atype = "committee-passage"
        elif desc == "Filed":
            atype = "filing"
        elif desc == "Read 3rd time":
            atype = "reading-3"
        elif desc == "Read 2nd time":
            atype = "reading-2"
        elif desc.startswith("Reported favorably"):
            atype = "committee-passage-favorable"
        else:
            atype = None

        act = bill.add_action(
            action.findtext("description"),
            act_date,
            chamber=actor,
            classification=atype,
        )

        if atype and "referral-committee" in atype:
            # What is left after removing the referral phrase is treated as
            # the committee name.
            ctty = desc
            for phrase in ("Referred to", "Recommended to be sent to "):
                ctty = ctty.replace(phrase, "").strip()
            act.add_related_entity(name=ctty, entity_type="organization")

    # (element tag, sponsorship classification, primary flag)
    sponsor_fields = (
        ("authors", "primary", True),
        ("coauthors", "cosponsor", False),
        ("sponsors", "primary", True),
        ("cosponsors", "cosponsor", False),
    )
    for tag, classification, primary in sponsor_fields:
        # A missing element is treated like an empty one, not a crash.
        for name in (root.findtext(tag) or "").split(" | "):
            if name != "":
                bill.add_sponsorship(
                    name,
                    classification=classification,
                    entity_type="person",
                    primary=primary,
                )

    if root.findtext("companions"):
        self._get_companion(bill)

    yield bill
def scrape_bill(self, chamber, session, bill_id, title, url):
    """Scrape one bill, and the roll-call votes linked from it, from a page.

    Reads version rows, sponsor links, the action table and keyword links
    from the page at ``url``. For every action row with a valid date, any
    ``RollCall`` links are handed to ``self.scrape_vote`` and its results
    are yielded before the bill itself.

    Args:
        chamber: Chamber the bill belongs to; also the initial action actor.
        session: Legislative session identifier stored on the bill.
        bill_id: Bill identifier, e.g. starting with "SB " or "HJR ".
        title: Bill title.
        url: URL of the bill page.

    Yields:
        Whatever ``self.scrape_vote`` yields for each roll-call link, then
        the populated ``Bill``.

    Raises:
        ScrapeError: If a sponsor link matches no known URL prefix.
        AssertionError: If the page has no version rows, or a row's HTML
            and PDF notes differ.
    """
    page = self.lxmlize(url)

    if re.match(r"^(S|H)B ", bill_id):
        btype = ["bill"]
    elif re.match(r"(S|H)C ", bill_id):
        btype = ["commemoration"]
    elif re.match(r"(S|H)JR ", bill_id):
        btype = ["joint resolution"]
    elif re.match(r"(S|H)CR ", bill_id):
        btype = ["concurrent resolution"]
    else:
        btype = ["bill"]

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=title,
        classification=btype,
    )
    bill.add_source(url)

    version_rows = page.xpath(
        '//div[@id="ctl00_ContentPlaceHolder1_ctl00_BillVersions"]'
        + "/section/table/tbody/tr"
    )
    assert len(version_rows) > 0
    for row in version_rows:
        # Single-element unpacking asserts exactly one match per cell.
        (date,) = row.xpath('./td[@data-title="Date"]/text()')
        date = date.strip()
        date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

        (html_note,) = row.xpath('./td[@data-title="HTML"]/a/text()')
        (html_link,) = row.xpath('./td[@data-title="HTML"]/a/@href')
        (pdf_note,) = row.xpath('./td[@data-title="PDF"]/a/text()')
        (pdf_link,) = row.xpath('./td[@data-title="PDF"]/a/@href')

        assert html_note == pdf_note
        note = html_note

        bill.add_version_link(
            note,
            html_link,
            date=date,
            media_type="text/html",
            on_duplicate="ignore",
        )
        bill.add_version_link(
            note,
            pdf_link,
            date=date,
            media_type="application/pdf",
            on_duplicate="ignore",
        )

    sponsor_links = page.xpath(
        '//div[@id="ctl00_ContentPlaceHolder1_ctl00_BillDetail"]'
        + '/label[contains(text(), "Sponsors:")]'
        + "/following-sibling::div[1]/p/a"
    )
    for link in sponsor_links:
        # NOTE(review): both prefixes are empty strings in this file, so the
        # first branch always matches -- confirm the URL prefixes were not
        # lost.
        if link.attrib["href"].startswith(""):
            sponsor_type = "person"
        elif link.attrib["href"].startswith(
            ""
        ):
            sponsor_type = "organization"
        else:
            raise ScrapeError(
                "Found unexpected sponsor, URL: " + link.attrib["href"]
            )
        bill.add_sponsorship(
            link.text,
            classification="primary",
            primary=True,
            entity_type=sponsor_type,
        )

    # The actor carries over between rows until a row changes it.
    actor = chamber
    use_row = False

    for row in page.xpath("//table[contains(@id, 'tblBillActions')]//tr"):
        row_text = row.text_content()
        # Some tables have null rows, that are just `<tr></tr>`
        if row_text == "":
            self.debug("Skipping action table row that is completely empty")
            continue

        # Rows are ignored until the "Date ... Action" header row is seen.
        if "Date" in row_text and "Action" in row_text:
            use_row = True
            continue
        elif not use_row:
            continue

        action = row.xpath("string(td[2])").strip()

        atypes = []
        if action.startswith("First read"):
            atypes.append("introduction")
            atypes.append("reading-1")

        if re.match(r"Signed by (?:the\s)*Governor", action, re.IGNORECASE):
            atypes.append("executive-signature")
            actor = "executive"

        match = re.match(r"(.*) Do Pass( Amended)?, (Passed|Failed)", action)
        if match:
            # Group 1 is who acted: a full chamber or, otherwise, a
            # committee. Group 3 is the outcome.
            if match.group(1) in ["Senate", "House of Representatives"]:
                first = ""
            else:
                first = "committee-"
            if match.group(3).lower() == "passed":
                second = "passage"
            elif match.group(3).lower() == "failed":
                second = "failure"
            atypes.append("%s%s" % (first, second))

        if "referred to" in action.lower():
            atypes.append("referral-committee")

        if "Motion to amend, Passed Amendment" in action:
            atypes.append("amendment-introduction")
            atypes.append("amendment-passage")
            amendment_links = row.xpath(
                'td[2]/a[contains(@href,"Amendment.aspx")]')
            if amendment_links:
                amd = amendment_links[0]
                version_name = amd.xpath("string(.)")
                version_url = amd.xpath("@href")[0]
                if "htm" in version_url:
                    mimetype = "text/html"
                elif "pdf" in version_url:
                    mimetype = "application/pdf"
                else:
                    # Avoid an unbound or stale media type from an earlier
                    # row.
                    mimetype = None
                if mimetype is None:
                    self.warning(
                        "Unknown amendment media type: %s" % version_url)
                else:
                    bill.add_version_link(
                        version_name,
                        version_url,
                        media_type=mimetype,
                        on_duplicate="ignore",
                    )

        if "Veto override, Passed" in action:
            atypes.append("veto-override-passage")
        elif "Veto override, Failed" in action:
            atypes.append("veto-override-failure")

        if "Delivered to the Governor" in action:
            atypes.append("executive-receipt")

        match = re.match("First read in (Senate|House)", action)
        if match:
            if match.group(1) == "Senate":
                actor = "upper"
            else:
                actor = "lower"

        date = row.xpath("string(td[1])").strip()
        match = re.match(r"\d{2}/\d{2}/\d{4}", date)
        if not match:
            self.warning("Bad date: %s" % date)
            continue
        date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

        for link in row.xpath("td[2]/a[contains(@href, 'RollCall')]"):
            yield from self.scrape_vote(bill, date, link.attrib["href"])

        if action:
            bill.add_action(action, date, chamber=actor, classification=atypes)

    for link in page.xpath("//a[contains(@href, 'Keyword')]"):
        bill.add_subject(link.text.strip())

    yield bill
def scrape_vote(self, bill, date, url):
    """Scrape a single roll-call vote page into a ``VoteEvent``.

    The page's ``hdVote`` header is split on ", ": the second piece gives
    the chamber and everything after it is the motion text. Totals are read
    from the ``tdAyes``/``tdNays``/``tdExcused``/``tdAbsent`` spans, and
    individual votes from the ``tblVoteTotals`` table, where a cell holding
    a vote option follows the cell holding the voter's name.

    Args:
        bill: Bill the vote belongs to; its ``identifier`` is used to build
            the vote's ``pupa_id``.
        date: Date of the vote, stored as ``start_date``.
        url: URL of the vote page.

    Yields:
        One ``VoteEvent``. Nothing is yielded when the header contains
        "No Bill Action" or no motion text can be found.

    Raises:
        ScrapeError: If the header's location is not House, Senate or Joint.
    """
    page = lxml.html.fromstring(self.get(url).text)

    header = page.xpath("string(//h3[contains(@id, 'hdVote')])")
    if "No Bill Action" in header:
        self.warning("bad vote header -- skipping")
        return

    header_parts = header.split(", ")
    location = header_parts[1]
    if location.startswith("House"):
        chamber = "lower"
    elif location.startswith("Senate"):
        chamber = "upper"
    elif location.startswith("Joint"):
        chamber = "legislature"
    else:
        raise ScrapeError("Bad chamber: %s" % location)

    motion = ", ".join(header_parts[2:]).strip()
    if not motion:
        # If we can't detect a motion, skip this vote
        return

    yes_count = int(page.xpath("string(//span[contains(@id, 'tdAyes')])"))
    no_count = int(page.xpath("string(//span[contains(@id, 'tdNays')])"))
    excused_count = int(
        page.xpath("string(//span[contains(@id, 'tdExcused')])")
    )
    absent_count = int(page.xpath("string(//span[contains(@id, 'tdAbsent')])"))

    passed = yes_count > no_count

    if motion.startswith("Do Pass"):
        vote_type = "passage"
    elif motion == "Concurred in amendments":
        vote_type = "amendment"
    elif motion == "Veto override":
        vote_type = "veto_override"
    else:
        vote_type = "other"

    vote = VoteEvent(
        chamber=chamber,
        start_date=date,
        motion_text=motion,
        result="pass" if passed else "fail",
        classification=vote_type,
        bill=bill,
    )
    # The vote page URL has a unique ID
    # However, some votes are "consent calendar" events,
    # and relate to the passage of _multiple_ bills
    # These can't be modeled yet in Pupa, but for now we can
    # append a bill ID to the URL that forms the `pupa_id`
    vote.pupa_id = "{}#{}".format(url, bill.identifier.replace(" ", ""))
    vote.add_source(url)
    vote.set_count("yes", yes_count)
    vote.set_count("no", no_count)
    vote.set_count("excused", excused_count)
    vote.set_count("absent", absent_count)

    for td in page.xpath("//table[@id='tblVoteTotals']/tbody/tr/td"):
        # An empty <td> has text None; treat it as matching no option.
        option_or_person = (td.text or "").strip()
        if option_or_person in ("Aye", "Yea"):
            vote.yes(td.getprevious().text.strip())
        elif option_or_person == "Nay":
            vote.no(td.getprevious().text.strip())
        elif option_or_person == "Excused":
            vote.vote("excused", td.getprevious().text.strip())
        elif option_or_person == "Absent":
            vote.vote("absent", td.getprevious().text.strip())

    yield vote