def record_votes(root, session, chamber):
    """Yield one VoteEvent per valid vote element found under *root*.

    Every ``div`` matched by the module-level ``vote_selectors`` is wrapped
    in a ``MaybeVote``; candidates whose ``is_valid`` flag is falsy are
    skipped.  The legislative session passed to the event is the first two
    characters of *session*.
    """
    selector = "//div{}".format("".join(vote_selectors))
    for element in root.xpath(selector):
        candidate = MaybeVote(element)
        if not candidate.is_valid:
            continue

        if candidate.passed:
            motion_text, result, classification = "passage", "pass", "passage"
        else:
            motion_text, result, classification = "other", "fail", None

        event = VoteEvent(
            chamber=chamber,
            start_date=None,
            motion_text=motion_text,
            result=result,
            classification=classification,
            legislative_session=session[0:2],
            bill=candidate.bill_id,
            bill_chamber=candidate.chamber,
        )

        # Missing tallies are reported as zero.
        event.set_count("yes", candidate.yeas or 0)
        event.set_count("no", candidate.nays or 0)
        event.set_count("not voting", candidate.present or 0)

        # Roll key -> callable that records one cleaned name, in the same
        # order the rolls were originally processed.
        recorders = (
            ("yeas", event.yes),
            ("nays", event.no),
            ("present", lambda who: event.vote("not voting", who)),
            ("absent", lambda who: event.vote("absent", who)),
        )
        for roll_key, record in recorders:
            for raw_name in candidate.votes[roll_key]:
                record(clean_vote_name(raw_name))

        yield event
def _parse_senate_votes(self, vote_data, bill, url):
    """Build an upper-chamber VoteEvent from one vote record.

    Args:
        vote_data: mapping read for ``voteDate`` (``YYYY-MM-DD``),
            ``voteType`` (``FLOOR`` or ``COMMITTEE``), ``committee.name``
            (committee votes only), ``version`` and ``memberVotes.items``.
        bill: passed through unchanged as the event's ``bill``.
        url: recorded as the event's source.

    Returns:
        The populated VoteEvent.  Its result is "pass" when the yes tally
        (AYE plus AYEWR) exceeds the NAY tally, otherwise "fail".

    Raises:
        ValueError: if ``voteType`` is neither ``FLOOR`` nor ``COMMITTEE``.
    """
    vote_datetime = datetime.datetime.strptime(vote_data["voteDate"], "%Y-%m-%d")

    vote_type = vote_data["voteType"]
    if vote_type == "FLOOR":
        motion = "Floor Vote"
    elif vote_type == "COMMITTEE":
        motion = "{} Vote".format(vote_data["committee"]["name"])
    else:
        raise ValueError("Unknown vote type encountered.")

    if vote_data["version"]:
        motion += " - Version: " + vote_data["version"]

    vote = VoteEvent(
        chamber="upper",
        start_date=vote_datetime.strftime("%Y-%m-%d"),
        motion_text=motion,
        classification="passage",
        # Placeholder; the real result is set once the roll is tallied.
        result="fail",
        bill=bill,
    )
    vote.add_source(url)

    vote_rolls = vote_data["memberVotes"]["items"]

    def members(code):
        """Return the legislators listed under *code*, or [] if none."""
        # Every vote code goes through the same guard, so an entry that is
        # missing, empty, or lacks an "items" list is simply skipped.
        roll = vote_rolls.get(code) or {}
        return roll.get("items") or []

    yes_count, no_count, other_count = 0, 0, 0

    # Count all yea votes (AYEWR is tallied together with AYE).
    for code in ("AYE", "AYEWR"):
        for legislator in members(code):
            vote.yes(legislator["fullName"])
            yes_count += 1

    # Count all nay votes.
    for legislator in members("NAY"):
        vote.no(legislator["fullName"])
        no_count += 1

    # Count all other types of votes.
    for code in ("EXC", "ABS", "ABD"):
        for legislator in members(code):
            vote.vote("other", legislator["fullName"])
            other_count += 1

    vote.result = "pass" if yes_count > no_count else "fail"
    vote.set_count("yes", yes_count)
    vote.set_count("no", no_count)
    vote.set_count("other", other_count)
    return vote
def scrape_votes_old(self, bill, billname, session):
    """Yield VoteEvents scraped from the archived bill page.

    The archive page is located from *session* and *billname*.  Each link
    whose href contains ``JournalText`` is treated as one vote: the link
    text is the date, the second cell of its row holds
    ``"<chamber> - <motion>"``, and the following row holds the ``Yea`` and
    ``Nay`` name tables.

    Raises:
        ScrapeError: if the chamber label is neither "House" nor "Senate".
    """
    vote_url = (
        "http://archives.legislature.state.oh.us/bills.cfm?ID="
        + session
        + "_"
        + billname
    )
    page = lxml.html.fromstring(self.get(vote_url).text)
    chamber_codes = {"House": "lower", "Senate": "upper"}

    def names_in(vote_row, marker):
        """Collect the non-empty cell texts of the div whose id has *marker*."""
        div = vote_row.xpath("td/font/div[contains(@id, '%s')]" % marker)[0]
        cell_texts = (td.xpath("string()") for td in div.xpath("table/tr/td"))
        return [text for text in cell_texts if text]

    for journal_link in page.xpath("//a[contains(@href, 'JournalText')]"):
        parsed = datetime.datetime.strptime(journal_link.text, "%m/%d/%Y")
        journal_day = self._tz.localize(parsed).date()
        date = "{:%Y-%m-%d}".format(journal_day)

        details = journal_link.xpath("string(../../../td[2])")
        parts = details.split(" - ")

        chamber_label = parts[0]
        if chamber_label not in chamber_codes:
            raise ScrapeError("Bad chamber: %s" % chamber_label)
        chamber = chamber_codes[chamber_label]

        # Only the first line after the chamber label is the motion.
        motion = parts[1].split("\n")[0].strip()

        vote_row = journal_link.xpath("../../..")[0].getnext()
        yeas = names_in(vote_row, "Yea")
        nays = names_in(vote_row, "Nay")

        vote = VoteEvent(
            chamber=chamber,
            start_date=date,
            motion_text=motion,
            result="pass" if len(yeas) > len(nays) else "fail",
            bill=bill,
            classification="passage",
        )
        for voter in yeas:
            vote.yes(voter)
        for voter in nays:
            vote.no(voter)
        vote.add_source(vote_url)
        yield vote
def test_full_vote_event():
    """Import an organization, two people, a bill and a vote event, then
    check that the stored VoteEvent is linked to all of them."""
    jurisdiction = create_jurisdiction()
    jurisdiction.legislative_sessions.create(name="1900", identifier="1900")

    john = ScrapePerson("John Smith", primary_org="lower")
    adam = ScrapePerson("Adam Smith", primary_org="lower")
    house = ScrapeOrganization(name="House", classification="lower")
    bill = ScrapeBill(
        "HB 1", "1900", "Axe & Tack Tax Act", from_organization=house._id
    )

    scraped_event = ScrapeVoteEvent(
        legislative_session="1900",
        motion_text="passage",
        start_date="1900-04-01",
        classification="passage:bill",
        result="pass",
        bill_chamber="lower",
        bill="HB 1",
        organization=house._id,
    )
    scraped_event.set_count("yes", 20)
    scraped_event.yes("John Smith")
    scraped_event.no("Adam Smith")

    # Run the importers in dependency order.
    org_importer = OrganizationImporter("jid")
    org_importer.import_data([house.as_dict()])

    person_importer = PersonImporter("jid")
    person_importer.import_data([john.as_dict(), adam.as_dict()])

    membership_importer = MembershipImporter(
        "jid", person_importer, org_importer, DumbMockImporter()
    )
    membership_importer.import_data(
        [john._related[0].as_dict(), adam._related[0].as_dict()]
    )

    bill_importer = BillImporter("jid", org_importer, person_importer)
    bill_importer.import_data([bill.as_dict()])

    vote_importer = VoteEventImporter(
        "jid", person_importer, org_importer, bill_importer
    )
    vote_importer.import_data([scraped_event.as_dict()])

    assert VoteEvent.objects.count() == 1
    stored = VoteEvent.objects.get()
    assert stored.legislative_session == LegislativeSession.objects.get()
    assert stored.motion_classification == ["passage:bill"]
    assert stored.bill == Bill.objects.get()

    tally = stored.counts.get()
    assert tally.option == "yes"
    assert tally.value == 20

    assert len(list(stored.votes.all())) == 2
    for cast in stored.votes.all():
        is_john = cast.voter_name == "John Smith"
        expected_name = "John Smith" if is_john else "Adam Smith"
        assert cast.option == ("yes" if is_john else "no")
        assert cast.voter == Person.objects.get(name=expected_name)
def parse_vote(self, bill, actor, date, motion, url, uniqid):
    """Fetch a plain-text roll call at *url* and yield it as a single vote.

    The page text is expected to contain ``YEAS n ... NAYS n ... ABSENT
    [OR NOT VOTING] n ...`` with voter names after each total, separated by
    runs of two or more whitespace characters.  *url* is added as a source
    on both *bill* and the vote.  The vote's chamber is *actor* when it is
    "upper" or "lower", otherwise an empty string.

    NOTE(review): a page that does not match the pattern raises
    AttributeError on the ``match.group`` calls.
    """
    page = self.get(url).text
    bill.add_source(url)

    vote_re = re.compile(
        r"YEAS -?\s?(\d+)(.*)NAYS -?\s?(\d+)"
        r"(.*)ABSENT( OR NOT VOTING)? -?\s?"
        r"(\d+)(.*)",
        re.MULTILINE | re.DOTALL,
    )
    match = vote_re.search(page)

    # Groups 1/3/6 are the totals; groups 2/4/7 are the name blocks.
    yes_count, no_count, other_count = (
        int(match.group(index)) for index in (1, 3, 6)
    )

    vote = Vote(
        chamber=actor if actor in ("upper", "lower") else "",
        start_date=date,
        motion_text=motion,
        result="pass" if yes_count > no_count else "fail",
        identifier=str(uniqid),
        classification="passage",
        bill=bill,
    )
    vote.add_source(url)
    vote.set_count("yes", yes_count)
    vote.set_count("no", no_count)
    vote.set_count("other", other_count)

    yes_names, no_names, other_names = (
        re.split(r"\s{2,}", match.group(index).strip()) for index in (2, 4, 7)
    )

    # Splitting an empty block gives [""], so blank entries are dropped.
    for name in filter(None, yes_names):
        vote.yes(name)
    for name in filter(None, no_names):
        vote.no(name)
    for name in filter(None, other_names):
        vote.vote("other", name)

    yield vote
def scrape_votes(self, bill):
    """Yield one vote per ``wa:RollCall`` returned for *bill*.

    The bill number is the second whitespace-separated token of
    ``bill.identifier``; the biennium comes from ``self.biennium``.  Absent
    and excused totals are merged into a single "other" count, and a roll
    call passes only when the yes total exceeds no plus other.
    """
    bill_num = bill.identifier.split()[1]
    url = (
        "http://wslwebservices.leg.wa.gov/legislationservice.asmx/"
        "GetRollCalls?billNumber=%s&biennium=%s" % (bill_num, self.biennium)
    )
    response = self.get(url)
    tree = lxml.etree.fromstring(response.content)
    chamber_by_agency = {"House": "lower", "Senate": "upper"}

    for roll_call in xpath(tree, "//wa:RollCall"):
        motion = xpath(roll_call, "string(wa:Motion)")
        seq_no = xpath(roll_call, "string(wa:SequenceNumber)")

        # Keep only the date portion of the timestamp.
        date_text = xpath(roll_call, "string(wa:VoteDate)").split("T")[0]
        date = datetime.datetime.strptime(date_text, "%Y-%m-%d").date()

        yes_count = int(xpath(roll_call, "string(wa:YeaVotes/wa:Count)"))
        no_count = int(xpath(roll_call, "string(wa:NayVotes/wa:Count)"))
        abs_count = int(xpath(roll_call, "string(wa:AbsentVotes/wa:Count)"))
        ex_count = int(xpath(roll_call, "string(wa:ExcusedVotes/wa:Count)"))
        other_count = abs_count + ex_count

        chamber = chamber_by_agency[xpath(roll_call, "string(wa:Agency)")]
        passed = yes_count > (no_count + other_count)

        vote = Vote(
            chamber=chamber,
            start_date=date,
            motion_text="{} (#{})".format(motion, seq_no),
            result="pass" if passed else "fail",
            bill=bill,
            classification=[],
        )
        vote.set_count("yes", yes_count)
        vote.set_count("no", no_count)
        vote.set_count("other", other_count)
        vote.add_source(url)

        recorders = {"Yea": vote.yes, "Nay": vote.no}
        for member_vote in xpath(roll_call, "wa:Votes/wa:Vote"):
            name = xpath(member_vote, "string(wa:Name)")
            # "VOte" is spelled exactly as the element is queried upstream.
            choice = xpath(member_vote, "string(wa:VOte)")
            record = recorders.get(choice)
            if record is None:
                vote.vote("other", name)
            else:
                record(name)

        yield vote
def parse_vote(self, actor, date, row, session, bill_id, bill_chamber, source):
    """Yield a single VoteEvent parsed from one vote *row* element.

    The first ``span`` in the row holds
    ``"<outcome>-<yes>-<no>-<other>"``; the text following the second and
    third spans lists the yes and no voters.  Later spans whose text starts
    with "Absent" or "Excused" contribute their trailing names as absent
    voters.  An empty motion text defaults to "passage".
    """
    spans = row.xpath(".//span")

    motion = row.text.replace("\u00a0", " ").replace("-", "").strip() or "passage"

    outcome, yes_count, no_count, other_count = (
        spans[0].text_content().rsplit("-", 3)
    )

    yes_votes = self.get_names(spans[1].tail)
    no_votes = self.get_names(spans[2].tail)

    other_votes = []
    for span in spans[3:]:
        if span.text.startswith(("Absent", "Excused")):
            other_votes.extend(self.get_names(span.tail))

    # Map the free-text outcome onto pass/fail; the raw text is kept
    # unchanged when no keyword is found in it.
    outcome_keywords = (
        ("adopted", "pass"),
        ("passed", "pass"),
        ("failed", "fail"),
    )
    lowered_outcome = outcome.lower()
    result = next(
        (value for keyword, value in outcome_keywords if keyword in lowered_outcome),
        outcome,
    )

    vote = VoteEvent(
        chamber=actor,
        start_date=date,
        motion_text=motion,
        bill=bill_id,
        bill_chamber=bill_chamber,
        result=result,
        classification="passage",
        legislative_session=session,
    )
    vote.add_source(source)
    vote.set_count("yes", int(yes_count))
    vote.set_count("no", int(no_count))
    vote.set_count("absent", int(other_count))

    def is_real_name(name):
        """True for non-empty names other than the literal "None"."""
        return bool(name) and name != "None"

    for name in filter(is_real_name, yes_votes):
        vote.yes(name)
    for name in filter(is_real_name, no_votes):
        vote.no(name)
    for name in filter(is_real_name, other_votes):
        vote.vote("absent", name)

    yield vote
def scrape_vote(self, chamber, session, bill_id, vote_url):
    """Scrape one roll-call page and yield it as a VoteEvent.

    Args:
        chamber: chamber of the bill; stored as ``bill_chamber``.  The
            vote's own chamber is always recorded as "lower".
        session: legislative session identifier for the event.
        bill_id: identifier of the bill the vote belongs to.
        vote_url: page to fetch; also used as the event's source and
            ``pupa_id``.

    Yields:
        A single VoteEvent, or nothing when fetching *vote_url* raises
        ``scrapelib.HTTPError``.
    """
    try:
        resp = self.get(vote_url)
    except scrapelib.HTTPError:
        # Best effort: an unavailable vote page is skipped.
        return
    doc = lxml.html.fromstring(resp.text)

    # Prefer the bold text in the first paragraph; fall back to the first
    # <h2> when that text is missing or blank.
    bold_text = doc.xpath("//p[1]//b[1]/text()")
    motion = bold_text[-1].strip() if bold_text else ""
    if not motion:
        motion = doc.xpath("//h2[1]/text()")[0].strip()

    # The heading splits so that token 0 is the yea total and token 3 the
    # nay total.
    vote_count = (
        doc.xpath("//h3[contains(text(),'YEA and ')]/text()")[0].strip().split()
    )
    yeas = int(vote_count[0])
    nays = int(vote_count[3])

    date = doc.xpath("//b[contains(text(),'Date:')]/../text()")[1].strip()
    date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

    vote = VoteEvent(
        chamber="lower",
        start_date=date,
        motion_text=motion,
        result="pass" if yeas > nays else "fail",
        classification="passage",
        legislative_session=session,
        bill=bill_id,
        bill_chamber=chamber,
    )
    vote.set_count("yes", yeas)
    vote.set_count("no", nays)
    vote.add_source(vote_url)
    vote.pupa_id = vote_url

    # first table has YEAs
    for name in doc.xpath("//table[1]//font/text()"):
        vote.yes(name.strip())

    # second table is nays
    for name in doc.xpath("//table[2]//font/text()"):
        vote.no(name.strip())

    yield vote
def test_full_vote_event():
    """Import a bill and a vote event, then check that the stored VoteEvent
    resolves its session, bill, count and both voters."""
    create_jurisdiction()
    bill = ScrapeBill("HB 1", "1900", "Axe & Tack Tax Act", chamber="lower")

    scraped_event = ScrapeVoteEvent(
        legislative_session="1900",
        motion_text="passage",
        start_date="1900-04-01",
        classification="passage:bill",
        result="pass",
        bill_chamber="lower",
        bill="HB 1",
        chamber="lower",
    )
    scraped_event.set_count("yes", 20)
    scraped_event.yes("John Smith")
    scraped_event.no("Adam Smith")

    # Both voters exist as people with a membership in the lower chamber.
    for voter_name in ("John Smith", "Adam Smith"):
        Person.objects.create(name=voter_name)
    for person in Person.objects.all():
        person.memberships.create(
            organization=Organization.objects.get(classification="lower")
        )

    bill_importer = BillImporter("jid")
    bill_importer.import_data([bill.as_dict()])
    VoteEventImporter("jid", bill_importer).import_data([scraped_event.as_dict()])

    assert VoteEvent.objects.count() == 1
    stored = VoteEvent.objects.get()
    assert stored.legislative_session == LegislativeSession.objects.get()
    assert stored.motion_classification == ["passage:bill"]
    assert stored.bill == Bill.objects.get()

    tally = stored.counts.get()
    assert tally.option == "yes"
    assert tally.value == 20

    assert len(list(stored.votes.all())) == 2
    for cast in stored.votes.all():
        is_john = cast.voter_name == "John Smith"
        expected_name = "John Smith" if is_john else "Adam Smith"
        assert cast.option == ("yes" if is_john else "no")
        assert cast.voter == Person.objects.get(name=expected_name)
def asvote(self):
    """Return this object's data as a VoteEvent.

    Chamber, date, motion, outcome, totals and voter lists all come from
    the instance's accessor methods; ``self.url`` is used both as the
    dedupe key and as the source.
    """
    chamber = self.chamber()
    start_date = self.date()
    motion_text = self.motion()
    result = "pass" if self.passed() else "fail"

    event = VoteEvent(
        chamber=chamber,
        start_date=start_date,
        motion_text=motion_text,
        result=result,
        classification="passage",
        bill=self.bill,
    )
    event.dedupe_key = self.url  # URL contains sequence number

    tallies = (
        ("yes", self.yes_count),
        ("no", self.no_count),
        ("other", self.other_count),
    )
    for option, get_total in tallies:
        event.set_count(option, get_total())

    rolls = (
        (self.yes_votes, event.yes),
        (self.no_votes, event.no),
        (self.other_votes, lambda who: event.vote("other", who)),
    )
    for get_voters, record in rolls:
        for voter in get_voters():
            record(voter)

    event.add_source(self.url)
    return event
def handle_page(self):
    """Yield a committee VoteEvent if the page reports vote totals.

    Nothing is yielded when no ``ctl00_MainContent_lblTotal`` span has
    text.  Otherwise the date, tallies, committee and action are read from
    their labelled spans and each non-blank ``vote-list`` item is recorded
    as yes ("Y"), no ("N") or not voting ("-").  Parenthesised "(Y)"/"(N)"
    entries are skipped.

    Raises:
        ValueError: if a member's vote code is not one of the above.
    """
    totals = self.doc.xpath(
        '//span[contains(@id, "ctl00_MainContent_lblTotal")]/text()'
    )
    # Checks to see if any vote totals are provided
    if not totals:
        return

    (raw_date,) = self.doc.xpath('//span[contains(@id, "lblDate")]/text()')
    date = format_datetime(
        datetime.datetime.strptime(raw_date, "%m/%d/%Y %I:%M:%S %p"),
        "US/Eastern",
    )

    yes_count = int(self.doc.xpath('//span[contains(@id, "lblYeas")]/text()')[0])
    no_count = int(self.doc.xpath('//span[contains(@id, "lblNays")]/text()')[0])
    other_count = int(
        self.doc.xpath('//span[contains(@id, "lblMissed")]/text()')[0]
    )
    result = "pass" if yes_count > no_count else "fail"

    (committee,) = self.doc.xpath('//span[contains(@id, "lblCommittee")]/text()')
    (action,) = self.doc.xpath('//span[contains(@id, "lblAction")]/text()')

    vote = VoteEvent(
        start_date=date,
        bill=self.kwargs["bill"],
        chamber="lower",
        motion_text="{} ({})".format(action, committee),
        result=result,
        classification="committee",
    )
    vote.add_source(self.url)
    vote.set_count("yes", yes_count)
    vote.set_count("no", no_count)
    vote.set_count("not voting", other_count)

    for item in self.doc.xpath('//ul[contains(@class, "vote-list")]/li'):
        if not item.text_content().strip():
            continue

        (member,) = item.xpath("span[2]//text()")
        (choice,) = item.xpath("span[1]//text()")
        member = member.strip()

        if choice == "Y":
            vote.yes(member)
        elif choice == "N":
            vote.no(member)
        elif choice == "-":
            vote.vote("not voting", member)
        elif re.search(r"\([YN]\)", choice):
            # Parenthetical votes appear to not be counted in the
            # totals for Yea, Nay, _or_ Missed
            continue
        else:
            raise ValueError("Unknown vote type found: {}".format(choice))

    yield vote
def scrape(self, session=None):
    """Scrape bills and roll-call votes for one session.

    For every category in ``self._categories`` the bulk listing is fetched,
    and for every piece of legislation in it a Bill is built from the
    listing (history actions, downloadable documents) and from the
    per-bill details endpoint (sponsors, hearing/markup documents, action
    attachments and votes).

    Args:
        session: session identifier; defaults to ``self.latest_session()``.

    Yields:
        Each VoteEvent found for a bill, followed by the Bill itself.
    """
    if not session:
        session = self.latest_session()
        self.info("no session specified, using %s", session)

    version_types = ("SignedAct", "Introduction", "Enrollment", "Engrossment")

    def label_attachment(link):
        """Return ``(is_version, label)`` for a document URL.

        ``is_version`` is true when the URL names one of the known version
        types.  ``label`` is the last matching version type, overridden by
        "Amendment" when the URL mentions an amendment, or None when
        neither applies.
        """
        lowered = link.lower()
        label = None
        for version_type in version_types:
            if version_type.lower() in lowered:
                label = version_type
        is_version = label is not None
        if "amendment" in lowered:
            label = "Amendment"
        return is_version, label

    # API vote value -> vote option; anything unlisted is counted as
    # "other" in case a new value pops up.
    option_by_vote = {
        "Yes": "yes",
        "No": "no",
        "Absent": "absent",
        "Recused": "abstain",
        "Present": "other",
    }

    for category in self._categories:
        leg_listing_url = (
            self._API_BASE_URL + f"BulkData/{category['categoryId']}/{session}"
        )
        # NOTE(review): verify=False disables TLS certificate checking for
        # this request and the details request below -- confirm it is
        # still required.
        resp = requests.post(leg_listing_url, headers=self._headers, verify=False)
        resp.raise_for_status()
        leg_listing = resp.json()

        for leg in leg_listing:
            bill = Bill(
                leg["legislationNumber"],
                legislative_session=session,
                title=leg["title"],
                classification=category["name"],
            )
            bill.add_source(leg_listing_url)
            bill_url = (
                f"https://lims.dccouncil.us/Legislation/{leg['legislationNumber']}"
            )
            bill.add_source(bill_url)

            if leg["lawNumber"]:
                bill.extras["lawNumber"] = leg["lawNumber"]

            # Actions
            for hist in leg["legislationHistory"]:
                hist_date = datetime.datetime.strptime(
                    hist["actionDate"], "%b %d, %Y"
                )
                hist_date = self._TZ.localize(hist_date)
                hist_action = hist["actionDescription"]
                if hist_action.split()[0] in ["OtherAmendment", "OtherMotion"]:
                    # Drop the leading "Other".
                    hist_action = hist_action[5:]
                hist_class = self.classify_action(hist_action)

                actor = "executive" if "mayor" in hist_action.lower() else "legislature"
                bill.add_action(
                    hist_action, hist_date, classification=hist_class, chamber=actor
                )

                # Documents with download links
                download = hist["downloadURL"]
                if download and "download" in download:
                    if not download.startswith("http"):
                        download = "https://lims.dccouncil.us/" + download
                    mimetype = "application/pdf" if download.endswith("pdf") else None
                    is_version, label = label_attachment(download)
                    if is_version:
                        bill.add_version_link(
                            label,
                            download,
                            media_type=mimetype,
                            on_duplicate="ignore",
                        )
                    else:
                        bill.add_document_link(
                            hist["actionDescription"],
                            download,
                            media_type=mimetype,
                            on_duplicate="ignore",
                        )

            # Grabs Legislation details
            leg_details_url = (
                self._API_BASE_URL
                + f"LegislationDetails/{leg['legislationNumber']}"
            )
            details_resp = requests.get(
                leg_details_url, headers=self._headers, verify=False
            )
            details_resp.raise_for_status()
            leg_details = details_resp.json()

            # Sponsors
            for introducer in leg_details["introducers"]:
                bill.add_sponsorship(
                    introducer["memberName"],
                    classification="primary",
                    entity_type="person",
                    primary=True,
                )

            # Co-sponsors: read each co-sponsor's own name (not the last
            # introducer's) and do not flag them as primary sponsors.
            for cosponsor in leg_details["coSponsors"] or []:
                bill.add_sponsorship(
                    cosponsor["memberName"],
                    classification="cosponsor",
                    entity_type="person",
                    primary=False,
                )

            # Committee Hearing Doc
            for hearing in leg_details["committeeHearing"]:
                if hearing["hearingRecord"]:
                    bill.add_document_link(
                        hearing["hearingType"],
                        hearing["hearingRecord"],
                        media_type="application/pdf",
                        on_duplicate="ignore",
                    )

            for markup in leg_details["committeeMarkup"]:
                if markup["committeeReport"]:
                    bill.add_document_link(
                        "Committee Markup",
                        markup["committeeReport"],
                        media_type="application/pdf",
                        on_duplicate="ignore",
                    )

            # Actions and Votes
            # Identifiers already emitted for this bill, to prevent
            # duplicate votes.
            vote_ids = set()
            for act in leg_details["actions"] or []:
                action_name = act["action"]
                action_date = datetime.datetime.strptime(
                    act["actionDate"][:10], "%Y-%m-%d"
                )
                action_date = self._TZ.localize(action_date)

                if action_name.split()[0] == "Other":
                    action_name = " ".join(action_name.split()[1:])

                actor = "executive" if "mayor" in action_name.lower() else "legislature"

                # Documents and Versions
                attachment = act["attachment"]
                if attachment:
                    mimetype = (
                        "application/pdf" if attachment.endswith("pdf") else None
                    )
                    is_version, label = label_attachment(attachment)
                    if is_version:
                        bill.add_version_link(
                            label,
                            attachment,
                            media_type=mimetype,
                            on_duplicate="ignore",
                        )
                    else:
                        # Fall back to the action name when the attachment
                        # has no recognisable type, so the note is never
                        # unbound or left over from an earlier attachment.
                        bill.add_document_link(
                            label or action_name,
                            attachment,
                            media_type=mimetype,
                            on_duplicate="ignore",
                        )

                # Votes
                vote_details = act["voteDetails"]
                if not vote_details:
                    continue
                result = vote_details["voteResult"]
                if not result:
                    continue

                status = self._vote_statuses[result.lower()]
                id_text = f"{leg['legislationNumber']}-{action_name}-{result}"
                if id_text in vote_ids:
                    continue
                vote_ids.add(id_text)

                v = VoteEvent(
                    identifier=id_text,
                    chamber=actor,
                    start_date=action_date,
                    motion_text=action_name,
                    result=status,
                    classification=self.classify_action(action_name),
                    bill=bill,
                )
                v.add_source(leg_listing_url)

                counts = dict.fromkeys(
                    ("yes", "no", "absent", "abstain", "other"), 0
                )
                for leg_vote in vote_details["votes"]:
                    mem_name = leg_vote["councilMember"]
                    option = option_by_vote.get(leg_vote["vote"], "other")
                    if option == "yes":
                        v.yes(mem_name)
                    elif option == "no":
                        v.no(mem_name)
                    else:
                        v.vote(option, mem_name)
                    counts[option] += 1

                for option, total in counts.items():
                    v.set_count(option, total)

                yield v

            yield bill
def scrape(self, session=None):
    """Scrape bills, resolutions, actions and roll-call votes for a session.

    Bills and resolutions are listed through three JSON endpoints; each
    bill's status page supplies sponsors and text versions, and further
    JSON endpoints keyed by the site's internal bill ID supply actions and
    roll calls.

    Args:
        session: session identifier; defaults to ``self.latest_session()``.

    Yields:
        VoteEvent objects for each roll call, and each Bill once it is
        fully populated.

    Raises:
        AssertionError: on an unrecognised bill number prefix, an amendment
            without a chamber, an action with an unknown chamber code, or a
            roll call whose outcome cannot be determined.
    """
    HTML_TAGS_RE = r"<.*?>"

    # (prefix, classification, chamber), checked in order so that longer
    # prefixes win over "H." / "S.".  A chamber of None means it must be
    # read from the record's "Body" field.
    bill_prefixes = (
        ("J.R.H.", "joint resolution", "lower"),
        ("J.R.S.", "joint resolution", "upper"),
        ("H.C.R.", "concurrent resolution", "lower"),
        ("S.C.R.", "concurrent resolution", "upper"),
        ("H.R.", "resolution", "lower"),
        ("S.R.", "resolution", "upper"),
        ("PR.", "constitutional amendment", None),
        ("H.", "bill", "lower"),
        ("S.", "bill", "upper"),
    )

    if session is None:
        session = self.latest_session()
    year_slug = self.jurisdiction.get_year_slug(session)

    # Load all bills and resolutions via the private API
    bills_url = "http://legislature.vermont.gov/bill/loadBillsReleased/{}/".format(
        year_slug
    )
    bills_json = self.get(bills_url).text
    bills = json.loads(bills_json)["data"] or []

    bills_url = "http://legislature.vermont.gov/bill/loadBillsIntroduced/{}/".format(
        year_slug
    )
    bills_json = self.get(bills_url).text
    bills.extend(json.loads(bills_json)["data"] or [])

    resolutions_url = "http://legislature.vermont.gov/bill/loadAllResolutionsByChamber/{}/both".format(
        year_slug
    )
    resolutions_json = self.get(resolutions_url).text
    bills.extend(json.loads(resolutions_json)["data"] or [])

    # Parse the information from each bill
    for info in bills:
        # Strip whitespace from strings
        info = {k: v.strip() for k, v in info.items()}
        bill_number = info["BillNumber"]

        # Identify the bill type and chamber
        for prefix, bill_type, bill_chamber in bill_prefixes:
            if bill_number.startswith(prefix):
                break
        else:
            raise AssertionError(
                "Unknown bill type found: '{}'".format(bill_number)
            )
        if bill_chamber is None:
            if info["Body"] == "H":
                bill_chamber = "lower"
            elif info["Body"] == "S":
                bill_chamber = "upper"
            else:
                raise AssertionError("Amendment not tied to chamber")

        bill_id_original_format = bill_number.replace(".", "").replace(" ", "")
        # put one space back in between type and number
        bill_id = re.sub(r"([a-zA-Z]+)(\d+)", r"\1 \2", bill_id_original_format)

        # Create the bill using its basic information
        bill = Bill(
            identifier=bill_id,
            legislative_session=session,
            chamber=bill_chamber,
            title=info["Title"],
            classification=bill_type,
        )
        if "resolution" in bill_type:
            bill.add_source(resolutions_url)
        else:
            bill.add_source(bills_url)

        # Load the bill's information page to access its metadata
        bill_url = "http://legislature.vermont.gov/bill/status/{0}/{1}".format(
            year_slug, bill_number
        )
        doc = self.lxmlize(bill_url)
        bill.add_source(bill_url)

        # Capture sponsors
        sponsors = doc.xpath(
            '//dl[@class="summary-table"]/dt[text()="Sponsor(s)"]/'
            "following-sibling::dd[1]/ul/li"
        )
        sponsor_type = "primary"
        for sponsor in sponsors:
            if sponsor.xpath("span/text()") == ["Additional Sponsors"]:
                sponsor_type = "cosponsor"
                continue

            sponsor_name = (
                sponsor.xpath("a/text()")[0]
                .replace("Rep.", "")
                .replace("Sen.", "")
                .strip()
            )
            # Skip the five-character "Less" + one-character toggle entry.
            # The previous slice test ([:5] == "Less") could never be true
            # together with len == 5, so the entry was never filtered.
            is_less_toggle = sponsor_name.startswith("Less") and len(sponsor_name) == 5
            if sponsor_name and not is_less_toggle:
                bill.add_sponsorship(
                    name=sponsor_name,
                    classification=sponsor_type,
                    entity_type="person",
                    primary=(sponsor_type == "primary"),
                )

        # Capture bill text versions
        # Warning: There's a TODO in VT's source code saying 'move this to where it used to be'
        # so leave in the old and new positions
        versions = doc.xpath(
            '//dl[@class="summary-table"]/dt[text()="Bill/Resolution Text"]/'
            "following-sibling::dd[1]/ul/li/a |"
            '//ul[@class="bill-path"]//a'
        )
        for version in versions:
            if version.xpath("text()"):
                bill.add_version_link(
                    note=version.xpath("text()")[0],
                    url=version.xpath("@href")[0].replace(" ", "%20"),
                    media_type="application/pdf",
                )

        # Identify the internal bill ID, used for actions and votes
        # If there is no internal bill ID, then it has no extra information
        id_match = re.search(
            r'"bill/loadBillDetailedStatus/.+?/(\d+)"',
            lxml.etree.tostring(doc).decode("utf-8"),
        )
        if id_match is None:
            self.warning("Bill {} appears to have no activity".format(bill_number))
            yield bill
            continue
        internal_bill_id = id_match.group(1)

        # Capture actions
        actions_url = "http://legislature.vermont.gov/bill/loadBillDetailedStatus/{0}/{1}".format(
            year_slug, internal_bill_id
        )
        actions_response = self.get(actions_url)
        # Checks if page actually has json posted; a missing Content-Type
        # header is treated the same as a non-JSON one.
        content_type = actions_response.headers.get("Content-Type") or ""
        # NOTE(review): both skips below drop the bill without yielding it,
        # unlike the "no activity" branch above -- confirm this is intended.
        if "json" not in content_type:
            continue
        actions = json.loads(actions_response.text)["data"]
        # Checks to see if any data is actually there
        if actions == "":
            continue
        bill.add_source(actions_url)

        for action in actions:
            action = {k: v for k, v in action.items() if v is not None}
            full_status = action["FullStatus"]

            if "Signed by Governor" in full_status:
                actor = "executive"
            elif action["ChamberCode"] == "H":
                actor = "lower"
            elif action["ChamberCode"] == "S":
                actor = "upper"
            else:
                raise AssertionError("Unknown actor for bill action")

            # Categorize action
            if "Signed by Governor" in full_status:
                action_type = "executive-signature"
            elif "Vetoed by the Governor" in full_status:
                action_type = "executive-veto"
            elif "Read first time" in full_status or "Read 1st time" in full_status:
                action_type = "introduction"
            elif "Reported favorably" in full_status:
                action_type = "committee-passage-favorable"
            elif actor == "lower" and any(
                x.lower().startswith("aspassed")
                for x in action["keywords"].split(";")
            ):
                action_type = "passage"
            elif actor == "upper" and any(
                # Upper-chamber keywords may carry a leading space.
                x.lower().startswith((" aspassed", "aspassed"))
                for x in action["keywords"].split(";")
            ):
                action_type = "passage"
            else:
                action_type = None

            # Manual fix for data error in
            # https://legislature.vermont.gov/bill/status/2020/H.511
            status_date = action["StatusDate"].replace("/0209", "/2019")

            # Manual fix for data error in
            # https://legislature.vermont.gov/bill/status/2020/H.754
            if bill_id == "H 754" and session == "2019-2020":
                status_date = status_date.replace("/0202", "/2020")

            # https://legislature.vermont.gov/bill/status/2020/H.942
            if bill_id == "H 942" and session == "2019-2020":
                status_date = status_date.replace("/0200", "/2020")

            action_date = datetime.datetime.strptime(
                status_date, "%m/%d/%Y"
            ).strftime("%Y-%m-%d")
            # strftime doesn't always pad year value (%Y) (https://bugs.python.org/issue32195)
            # and sometimes this state has typos in year part of the StatusDate value
            # which can cause validation errors, so fix leading zeroes if they are missing
            dash_at = action_date.find("-")
            if dash_at < 4:
                action_date = ("0" * (4 - dash_at)) + action_date

            bill.add_action(
                description=re.sub(HTML_TAGS_RE, "", full_status),
                date=action_date,
                chamber=actor,
                classification=action_type,
            )

        # Capture votes
        votes_url = "http://legislature.vermont.gov/bill/loadBillRollCalls/{0}/{1}".format(
            year_slug, internal_bill_id
        )
        votes_json = self.get(votes_url).text
        votes = json.loads(votes_json)["data"]
        bill.add_source(votes_url)

        for vote in votes:
            roll_call_id = vote["VoteHeaderID"]
            roll_call_url = (
                "http://legislature.vermont.gov/bill/"
                "loadBillRollCallDetails/{0}/{1}".format(year_slug, roll_call_id)
            )
            roll_call_json = self.get(roll_call_url).text
            roll_call = json.loads(roll_call_json)["data"]

            roll_call_yea = []
            roll_call_nay = []
            roll_call_not_voting = []
            for member in roll_call:
                # Names arrive as "<name> of <district>".
                (member_name, _district) = member["MemberName"].split(" of ")
                member_name = member_name.strip()

                if member["MemberVote"] == "Yea":
                    roll_call_yea.append(member_name)
                elif member["MemberVote"] == "Nay":
                    roll_call_nay.append(member_name)
                else:
                    roll_call_not_voting.append(member_name)

            vote_status = vote["FullStatus"]
            if (
                "Passed -- " in vote_status
                # seems like we've seen both
                or "Governor overridden" in vote_status
                or "Governor overriden" in vote_status
            ):
                did_pass = True
            elif (
                "Failed -- " in vote_status
                or "Veto of the Governor sustained" in vote_status
            ):
                did_pass = False
            else:
                raise AssertionError(
                    "Roll call vote result is unclear: " + vote_status
                )

            # Check vote counts
            yea_count = int(re.search(r"Yeas = (\d+)", vote_status).group(1))
            nay_count = int(re.search(r"Nays = (\d+)", vote_status).group(1))

            vote_start_date = datetime.datetime.strptime(
                vote["StatusDate"], "%m/%d/%Y"
            ).strftime("%Y-%m-%d")
            motion_text = re.sub(HTML_TAGS_RE, "", vote_status).strip()
            vote_identifier = (
                vote["StatusDate"] + "--" + motion_text + "--" + roll_call_url
            )
            vote_to_add = VoteEvent(
                identifier=vote_identifier,
                bill=bill,
                chamber=("lower" if vote["ChamberCode"] == "H" else "upper"),
                start_date=vote_start_date,
                motion_text=motion_text,
                result="pass" if did_pass else "fail",
                classification="passage",
                legislative_session=session,
            )
            vote_to_add.add_source(roll_call_url)

            vote_to_add.set_count("yes", yea_count)
            vote_to_add.set_count("no", nay_count)
            vote_to_add.set_count("not voting", len(roll_call_not_voting))

            for member in roll_call_yea:
                vote_to_add.yes(member)
            for member in roll_call_nay:
                vote_to_add.no(member)
            for member in roll_call_not_voting:
                vote_to_add.vote("not voting", member)

            yield vote_to_add

        # The print-view links below use the session's year slug instead of
        # a hard-coded 2020, so they point at the session being scraped.

        # Witnesses:
        #   http://legislature.vermont.gov/bill/loadBillWitnessList/{year_slug}/{internal_bill_id}
        witnesses_doc_link_url = "https://legislature.vermont.gov/bill/print/{0}/{1}/witnesses".format(
            year_slug, bill_id_original_format
        )
        bill.add_document_link(
            note="Witness List",
            url=witnesses_doc_link_url,
            media_type="text/html",
        )

        # Conference committee members:
        #   http://legislature.vermont.gov/bill/loadBillConference/{year_slug}/{bill_number}
        conferees_doc_link_url = "https://legislature.vermont.gov/bill/print/{0}/{1}/conference".format(
            year_slug, bill_id_original_format
        )
        page = self.lxmlize(conferees_doc_link_url)
        no_data = page.xpath('//div[@class="no-data"]/text()')
        if not no_data:
            bill.add_document_link(
                note="Conference Committee Members",
                url=conferees_doc_link_url,
                media_type="text/html",
            )

        # Committee meetings:
        #   http://legislature.vermont.gov/committee/loadHistoryByBill/{year_slug}?LegislationId={internal_bill_id}
        meetings_doc_link_url = "https://legislature.vermont.gov/bill/print/{0}/{1}/meetings".format(
            year_slug, bill_id_original_format
        )
        bill.add_document_link(
            note="Committee Meetings",
            url=meetings_doc_link_url,
            media_type="text/html",
        )

        yield bill
def scrape_votes(self, bill, url):
    """Scrape every roll call listed on one vote page and yield ``Vote`` objects.

    The page at ``url`` is fetched and searched for chamber header paragraphs
    (text matching "OKLAHOMA HOUSE" or "OKLAHOMA STATE SENATE").  The
    paragraphs that follow each header supply the motion, the RCS number,
    the date, the per-category totals and the member names.

    Args:
        bill: passed through unchanged as the ``bill`` of every ``Vote``.
        url: address of the vote page; also recorded as the vote source and
            combined with the RCS number to form ``dedupe_key``.

    Yields:
        One ``Vote`` per distinct RCS number.  Roll calls flagged as badly
        formatted are skipped, and scraping stops entirely (after a warning)
        at the first header whose motion text is empty.
    """
    # Non-breaking spaces are normalised up front so the text matching below
    # only has to deal with ordinary spaces.
    page = lxml.html.fromstring(self.get(url).text.replace(u"\xa0", " "))

    # RCS numbers already emitted; a repeated number is ignored.
    seen_rcs = set()

    # EXSLT regular-expression namespace, needed for re:test() in the XPath.
    re_ns = "http://exslt.org/regular-expressions"
    path = r"//p[re:test(text(), 'OKLAHOMA\s+(HOUSE|STATE\s+SENATE)')]"
    for header in page.xpath(path, namespaces={"re": re_ns}):
        bad_vote = False
        # Each chamber has the motion name on a different line of the file
        if "HOUSE" in header.xpath("string()"):
            chamber = "lower"
            motion_index = 8
        else:
            chamber = "upper"
            motion_index = 13

        motion = header.xpath(
            "string(following-sibling::p[%d])" % motion_index).strip()
        motion = re.sub(r"\s+", " ", motion)

        if not motion.strip():
            # NOTE: this ends the whole generator, not just the current header.
            self.warning("Motion text not found")
            return

        # A trailing PASSED/FAILED is the outcome, not part of the motion text.
        match = re.match(r"^(.*) (PASSED|FAILED)$", motion)
        if match:
            motion = match.group(1)
            passed = match.group(2) == "PASSED"
        else:
            # Outcome is derived from the totals further down.
            passed = None

        rcs_p = header.xpath(
            "following-sibling::p[contains(., 'RCS#')]")[0]
        rcs_line = rcs_p.xpath("string()").replace(u"\xa0", " ")
        rcs = re.search(r"RCS#\s+(\d+)", rcs_line).group(1)
        if rcs in seen_rcs:
            continue
        else:
            seen_rcs.add(rcs)

        # The date is read from the element right after the RCS paragraph.
        date_line = rcs_p.getnext().xpath("string()")
        date = re.search(r"\d+/\d+/\d+", date_line).group(0)
        date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

        # vtype is the category ("yes"/"no"/"other") that name lines are
        # currently being filed under; it changes whenever a tally line is hit.
        vtype = None
        counts = collections.defaultdict(int)
        votes = collections.defaultdict(list)

        # Nothing is recorded until the YEAS tally line has been seen.
        seen_yes = False

        # Tallies and names are read from the 14th following paragraph onward.
        for sib in header.xpath("following-sibling::p")[13:]:
            line = sib.xpath("string()").replace("\r\n", " ").strip()
            if "*****" in line:
                # A row of asterisks closes this roll call.
                break
            regex = (r"(YEAS|NAYS|EXCUSED|VACANT|CONSTITUTIONAL "
                     r"PRIVILEGE|NOT VOTING|N/V)\s*:\s*(\d+)(.*)")
            match = re.match(regex, line)
            if match:
                if match.group(1) == "YEAS" and "RCS#" not in line:
                    vtype = "yes"
                    seen_yes = True
                elif match.group(1) == "NAYS" and seen_yes:
                    vtype = "no"
                elif match.group(1) == "VACANT":
                    continue  # skip these
                elif seen_yes:
                    vtype = "other"
                if seen_yes and match.group(3).strip():
                    # Extra text after the number means the tally line was
                    # not laid out as expected; the vote is dropped below.
                    self.warning("Bad vote format, skipping.")
                    bad_vote = True
                counts[vtype] += int(match.group(2))
            elif seen_yes:
                # NOTE(review): the delimiter below looks whitespace-collapsed
                # in this copy of the file; confirm against upstream that it
                # is not meant to be a wider multi-space column gap.
                for name in line.split(" "):
                    if not name:
                        continue
                    if "HOUSE" in name or "SENATE " in name:
                        continue
                    votes[vtype].append(name.strip())

        if bad_vote:
            continue

        if passed is None:
            # No explicit PASSED/FAILED: treat as passed only when the yeas
            # outnumber nays and "other" votes combined.
            passed = counts["yes"] > (counts["no"] + counts["other"])

        vote = Vote(
            chamber=chamber,
            start_date=date.strftime("%Y-%m-%d"),
            motion_text=motion,
            result="pass" if passed else "fail",
            bill=bill,
            classification="passage",
        )
        vote.set_count("yes", counts["yes"])
        vote.set_count("no", counts["no"])
        vote.set_count("other", counts["other"])
        vote.dedupe_key = url + "#" + rcs

        vote.add_source(url)

        for name in votes["yes"]:
            vote.yes(name)
        for name in votes["no"]:
            # presumably guards against a tally line leaking into the name
            # list -- TODO confirm
            if ":" in name:
                raise Exception(name)
            vote.no(name)
        for name in votes["other"]:
            vote.vote("other", name)

        yield vote
def scrape_vote(self, bill, vote_json, session):
    """Build and yield the ``VoteEvent`` described by one vote JSON record.

    Args:
        bill: bill the vote belongs to; passed straight to ``VoteEvent``.
        vote_json: mapping holding ``action``, ``amendmentNumber``,
            ``chamber``, ``voteDate``, ``billNumber``, the five
            ``*VotesCount`` totals and the five comma-separated ``*Votes``
            name strings read below.
        session: legislative session; also used to build the source URL.

    Yields:
        A single ``VoteEvent`` carrying the totals, the individual votes and
        a source link.
    """
    if vote_json["amendmentNumber"]:
        motion = "{}: {}".format(vote_json["amendmentNumber"], vote_json["action"])
    else:
        motion = vote_json["action"]

    result = (
        "pass" if vote_json["yesVotesCount"] > vote_json["noVotesCount"] else "fail"
    )

    v = VoteEvent(
        chamber=self.chamber_abbrev_map[vote_json["chamber"]],
        start_date=self.parse_local_date(vote_json["voteDate"]),
        motion_text=motion,
        result=result,
        legislative_session=session,
        bill=bill,
        classification="other",
    )

    v.set_count(option="yes", value=vote_json["yesVotesCount"])
    v.set_count("no", vote_json["noVotesCount"])
    v.set_count("absent", vote_json["absentVotesCount"])
    v.set_count("excused", vote_json["excusedVotesCount"])
    v.set_count("other", vote_json["conflictVotesCount"])

    def voters(names_key):
        # Strip first, then drop blanks, so a whitespace-only fragment (for
        # example the one left behind by a trailing ", ") never becomes an
        # empty voter name.
        return filter(None, map(str.strip, vote_json[names_key].split(",")))

    for name in voters("yesVotes"):
        v.yes(name)

    for name in voters("noVotes"):
        v.no(name)

    # add votes with other classifications
    # option can be 'yes', 'no', 'absent',
    # 'abstain', 'not voting', 'paired', 'excused'
    for option, names_key in (
        ("absent", "absentVotes"),
        ("excused", "excusedVotes"),
        ("other", "conflictVotes"),
    ):
        for name in voters(names_key):
            v.vote(option=option, voter=name)

    source_url = "http://lso.wyoleg.gov/Legislation/{}/{}".format(
        session, vote_json["billNumber"]
    )
    v.add_source(source_url)

    yield v
def scrape_vote(self, bill, vote_id, session):
    """Fetch one roll call by id and yield it as a ``VoteEvent``.

    Args:
        bill: bill the roll call belongs to.
        vote_id: roll call id posted to the JSON endpoint.
        session: legislative session stored on the ``VoteEvent``.

    Yields:
        One ``VoteEvent``; nothing at all when the endpoint returns an
        empty/falsy payload.
    """
    endpoint = "https://legis.delaware.gov/json/RollCall/GetRollCallVoteByRollCallId"
    payload = {"rollCallId": vote_id, "sort": "", "group": "", "filter": ""}

    self.info("Fetching vote {} for {}".format(vote_id, bill.identifier))
    response = self.post(url=endpoint, data=payload, allow_redirects=True).json()
    if not response:
        return

    roll = response["Model"]
    chamber = self.chamber_map[roll["ChamberName"]]

    # "7/1/16 01:00 AM"
    taken_at = dt.datetime.strptime(roll["TakenAtDateTime"], "%m/%d/%y %I:%M %p")
    vote_date = taken_at.strftime("%Y-%m-%d")

    # TODO: What does this code mean?
    motion = roll["RollCallVoteType"]

    if roll["RollCallStatus"] == "Passed":
        outcome = "pass"
    else:
        outcome = "fail"

    # Everything that is neither a yes nor a no is reported as "other".
    other_count = sum(
        int(roll[field])
        for field in (
            "NotVotingCount",
            "VacantVoteCount",
            "AbsentVoteCount",
            "ConflictVoteCount",
        )
    )

    vote = VoteEvent(
        chamber=chamber,
        start_date=vote_date,
        motion_text=motion,
        result=outcome,
        bill=bill,
        legislative_session=session,
        classification=[],
    )

    # Vote URL is just a generic search URL with POSTed data,
    # so provide a different link
    pdf_url = (
        "https://legis.delaware.gov/json/RollCallController/GenerateRollCallPdf"
        "?rollCallId={}&chamberId={}"
    ).format(vote_id, self.chamber_codes[chamber])
    vote.add_source(pdf_url)
    vote.dedupe_key = pdf_url

    vote.set_count("yes", roll["YesVoteCount"])
    vote.set_count("no", roll["NoVoteCount"])
    vote.set_count("other", other_count)

    for row in roll["AssemblyMemberVotes"]:
        # AssemblyMemberId looks like it should work here,
        # but for some sessions it's bugged to only return session
        try:
            member = self.legislators_by_short[str(row["ShortName"])]
            voter_name = member["DisplayName"]
        except KeyError:
            self.warning("could not find legislator short name %s", row["ShortName"])
            voter_name = row["ShortName"]

        code = row["SelectVoteTypeCode"]
        if code == "Y":
            vote.yes(voter_name)
        elif code == "N":
            vote.no(voter_name)
        else:
            vote.vote("other", voter_name)

    yield vote
def scrape_vote(self, bill, date, url):
    """Scrape one HTML vote page and yield the resulting ``VoteEvent``.

    Args:
        bill: bill the vote is attached to.
        date: start date stored on the ``VoteEvent``.
        url: vote page to fetch; also used as source and in ``pupa_id``.

    Yields:
        One ``VoteEvent``, or nothing when the header says "No Bill Action"
        or no motion text can be read from it.

    Raises:
        ScrapeError: the header names a location that is not House, Senate
            or Joint.
    """
    doc = lxml.html.fromstring(self.get(url).text)

    header = doc.xpath("string(//h3[contains(@id, 'hdVote')])")
    if "No Bill Action" in header:
        self.warning("bad vote header -- skipping")
        return

    # Header fields are comma-separated: [1] is the location, [2:] the motion.
    header_parts = header.split(", ")
    location = header_parts[1]
    for prefix, chamber in (
        ("House", "lower"),
        ("Senate", "upper"),
        ("Joint", "legislature"),
    ):
        if location.startswith(prefix):
            break
    else:
        raise ScrapeError("Bad chamber: %s" % location)

    motion = ", ".join(header_parts[2:]).strip()
    if not motion:
        # If we can't detect a motion, skip this vote
        return

    yes_count = int(doc.xpath("string(//span[contains(@id, 'tdAyes')])"))
    no_count = int(doc.xpath("string(//span[contains(@id, 'tdNays')])"))
    excused_count = int(doc.xpath("string(//span[contains(@id, 'tdExcused')])"))
    absent_count = int(doc.xpath("string(//span[contains(@id, 'tdAbsent')])"))

    outcome = "pass" if yes_count > no_count else "fail"

    if motion.startswith("Do Pass"):
        classification = "passage"
    else:
        classification = {
            "Concurred in amendments": "amendment",
            "Veto override": "veto_override",
        }.get(motion, "other")

    vote = VoteEvent(
        chamber=chamber,
        start_date=date,
        motion_text=motion,
        result=outcome,
        classification=classification,
        bill=bill,
    )
    # The vote page URL has a unique ID
    # However, some votes are "consent calendar" events,
    # and relate to the passage of _multiple_ bills
    # These can't be modeled yet in Pupa, but for now we can
    # append a bill ID to the URL that forms the `pupa_id`
    # https://github.com/opencivicdata/pupa/issues/308
    vote.pupa_id = "{}#{}".format(url, bill.identifier.replace(" ", ""))
    vote.add_source(url)

    vote.set_count("yes", yes_count)
    vote.set_count("no", no_count)
    vote.set_count("excused", excused_count)
    vote.set_count("absent", absent_count)

    # Each cell holding a vote label is preceded by the cell holding the
    # voter's name; cells with any other text are ignored.
    yes_labels = ("Aye", "Yea")
    other_options = {"Excused": "excused", "Absent": "absent"}
    for cell in doc.xpath("//table[@id='tblVoteTotals']/tbody/tr/td"):
        label = cell.text.strip()
        if label in yes_labels:
            vote.yes(cell.getprevious().text.strip())
        elif label == "Nay":
            vote.no(cell.getprevious().text.strip())
        elif label in other_options:
            vote.vote(other_options[label], cell.getprevious().text.strip())

    yield vote
def scrape_votes_for_chamber(self, chamber, vote_data, bill, link):
    """Parse one chamber's block of vote text and yield a ``VoteEvent`` per vote.

    ``vote_data`` is split into individual votes on the
    "<word> by <name> -" lead-in; each vote is then split into fields on a
    run of ten non-breaking spaces.  The first field is the motion (which may
    contain an m/d/Y date); later fields carry totals and voter name lists.

    Args:
        chamber: chamber stored on every ``VoteEvent``.
        vote_data: raw text holding all of the chamber's votes.
        bill: bill the votes belong to.
        link: URL recorded as the source of every vote.

    Yields:
        One ``VoteEvent`` per vote that has at least one field besides the
        motion.  ``self._seen_votes`` is read and updated so that a repeated
        motion text gets a numeric suffix.
    """
    # Compiled once: none of these patterns depends on the vote being parsed.
    count_re = re.compile(r"\d+$")
    aye_names_re = re.compile(r"^.+voting aye were: (.+) -")
    no_names_re = re.compile(r"^.+voting no were: (.+) -")
    not_voting_names_re = re.compile(r"^.+present and not voting were: (.+) -")

    raw_vote_data = re.split(r"\w+? by [\w ]+?\s+-", vote_data.strip())[1:]

    motion_count = 1

    for raw_vote in raw_vote_data:
        raw_vote = raw_vote.split(u"\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0")
        motion = raw_vote[0]

        if len(raw_vote) < 2:
            # Only a motion and no tallies or names: nothing to record.
            continue

        vote_date = re.search(r"(\d+/\d+/\d+)", motion)
        if vote_date:
            vote_date = datetime.datetime.strptime(vote_date.group(), "%m/%d/%Y")

        passed = (
            "Passed" in motion
            or "Recommended for passage" in motion
            or "Rec. for pass" in motion
            or "Adopted" in raw_vote[1]
        )

        yes_count = 0
        no_count = 0
        not_voting_count = 0
        ayes = []
        nos = []
        not_voting = []

        for field in raw_vote[1:]:
            field = field.strip()
            # Trailing digits, if any, are the total of a "...<n>" tally field.
            count_match = count_re.search(field)
            if field.startswith("Ayes...") and count_match:
                yes_count = int(count_match.group())
            elif field.startswith("Noes...") and count_match:
                no_count = int(count_match.group())
            elif field.startswith("Present and not voting...") and count_match:
                not_voting_count += int(count_match.group())
            elif aye_names_re.search(field):
                ayes = aye_names_re.search(field).groups()[0].split(", ")
            elif no_names_re.search(field):
                nos = no_names_re.search(field).groups()[0].split(", ")
            elif not_voting_names_re.search(field):
                not_voting += not_voting_names_re.search(field).groups()[0].split(", ")

        motion = motion.strip()
        # Un-escape ampersands.  The previous replace("&", "&") swapped the
        # character for itself and therefore never un-escaped anything.
        motion = motion.replace("&amp;", "&")

        # A motion text already used gets a running number appended so that
        # otherwise identical motions stay distinguishable.
        if motion in self._seen_votes:
            motion = "{} ({})".format(motion, motion_count)
            motion_count += 1
        self._seen_votes.add(motion)

        vote = VoteEvent(
            motion_text=motion,
            start_date=vote_date.strftime("%Y-%m-%d") if vote_date else None,
            classification="passage",
            result="pass" if passed else "fail",
            chamber=chamber,
            bill=bill,
        )
        vote.set_count("yes", yes_count)
        vote.set_count("no", no_count)
        vote.set_count("not voting", not_voting_count)
        vote.add_source(link)

        # A name is recorded once only, under the first list it appears in.
        seen = set()
        for a in ayes:
            if a in seen:
                continue
            vote.yes(a)
            seen.add(a)
        for n in nos:
            if n in seen:
                continue
            vote.no(n)
            seen.add(n)
        for n in not_voting:
            if n in seen:
                continue
            vote.vote("not voting", n)
            seen.add(n)

        yield vote
def scrape_vote(self, bill, name, url):
    """Scrape one roll-call page laid out as a table and yield a ``Vote``.

    Args:
        bill: bill the vote belongs to; its ``legislative_session`` supplies
            the year for the vote date.
        name: motion text stored on the ``Vote``.
        url: roll-call page; a URL containing "VOTE/h" is treated as the
            lower chamber, anything else as the upper chamber.

    Yields:
        One ``Vote``, or nothing when the page contains "BUDGET ADDRESS".
    """
    # The roll-call table is laid out differently per chamber: `cols` are the
    # starting columns of each member group, the offsets locate the name and
    # the yes/no marks relative to that start.
    if "VOTE/h" in url:
        vote_chamber = "lower"
        cols, name_offset, yes_offset, no_offset = (1, 5, 9, 13), 3, 0, 1
    else:
        vote_chamber = "upper"
        cols, name_offset, yes_offset, no_offset = (1, 6), 4, 1, 2

    # NOTE(review): verify=False is passed through to self.get; presumably it
    # turns off TLS certificate verification -- confirm this is still needed.
    raw_page = self.get(url, verify=False).text

    if "BUDGET ADDRESS" in raw_page:
        return

    doc = lxml.html.fromstring(raw_page)

    def first_number(xpath):
        # First run of digits in the text selected by `xpath`.
        return int(re.match(r"[^\d]*(\d+)[^\d]*", doc.xpath(xpath)).group(1))

    yes_count = first_number("string(//span[contains(., 'Those voting Yea')])")
    no_count = first_number("string(//span[contains(., 'Those voting Nay')])")
    other_count = first_number("string(//span[contains(., 'Those absent')])")
    need_count = first_number("string(//span[contains(., 'Necessary for')])")

    taken_text = doc.xpath("string(//span[contains(., 'Taken on')])")
    month_day = re.match(r".*Taken\s+on\s+(\d+/\s?\d+)", taken_text).group(1)
    month_day = month_day.replace(" ", "")
    # The page only gives month/day; the year comes from the bill's session.
    vote_date = datetime.datetime.strptime(
        month_day + " " + bill.legislative_session, "%m/%d %Y"
    ).date()

    # not sure about classification.
    # NOTE(review): the comparison is strict, so exactly reaching the
    # "Necessary for" number counts as a failure -- confirm that is intended.
    vote = Vote(
        chamber=vote_chamber,
        start_date=vote_date,
        motion_text=name,
        result="pass" if yes_count > need_count else "fail",
        classification="passage",
        bill=bill,
    )
    vote.set_count("yes", yes_count)
    vote.set_count("no", no_count)
    vote.set_count("other", other_count)
    vote.add_source(url)

    table = doc.xpath("//table")[0]
    for row in table.xpath("tr"):
        for start in cols:
            voter = row.xpath("string(td[%d])" % (start + name_offset)).strip()

            if not voter or voter == "VACANT":
                continue
            voter = string.capwords(voter)

            if "Y" in row.xpath("string(td[%d])" % (start + yes_offset)):
                vote.yes(voter)
            elif "N" in row.xpath("string(td[%d])" % (start + no_offset)):
                vote.no(voter)
            else:
                vote.vote("other", voter)

    yield vote
def process_vote(self, vote, bill, member_ids):
    """Turn one vote dict into a ``VoteEvent`` and pick up attached documents.

    Args:
        vote: mapping describing the vote.  ``ReadingDescription`` is the
            motion; ``VoteResult`` (optional), ``DateOfVote`` and
            ``MemberVotes`` are read as well.  The mapping is modified in
            place: a ``"type"`` key is set when the motion mentions an
            amendment or a reading.
        bill: bill the vote belongs to; also receives any document found
            under ``AttachmentPath``.
        member_ids: mapping from integer member id to member name.

    Returns:
        The populated ``VoteEvent``, or ``None`` (after a warning) when the
        motion is missing or the result cannot be determined.
    """
    try:
        motion = vote["ReadingDescription"]
    except KeyError:
        self.logger.warning(
            "Can't even figure out what we're voting on. Skipping.")
        return

    motion_lower = motion.lower()

    if "VoteResult" not in vote:
        if "postponed" in motion_lower:
            result = "Postponed"
            # "pass" because we're talking about the motion, not the amendment
            status = "pass"
        elif "tabled" in motion_lower:
            result = "Tabled"
            status = "pass"
        else:
            self.logger.warning("Could not find result of vote, skipping.")
            return
    else:
        result = vote["VoteResult"].strip().lower()
        statuses = {
            "approved": "pass",
            "disapproved": "fail",
            "failed": "fail",
            "declined": "fail",
            "passed": "pass",
        }
        try:
            status = statuses[result]
        except KeyError:
            self.logger.warning(
                "Unexpected vote result '{result},' skipping vote.".format(
                    result=result))
            return

    date = self.date_format(vote["DateOfVote"])

    leg_votes = vote["MemberVotes"]
    v = VoteEvent(
        chamber="legislature",
        start_date=date,
        motion_text=motion,
        result=status,
        classification="passage",
        bill=bill,
    )

    yes_count = no_count = other_count = 0
    for leg_vote in leg_votes:
        mem_name = member_ids[int(leg_vote["MemberId"])]
        # "1" is recorded as a yes, "2" as a no, anything else as "other".
        if leg_vote["Vote"] == "1":
            yes_count += 1
            v.yes(mem_name)
        elif leg_vote["Vote"] == "2":
            no_count += 1
            v.no(mem_name)
        else:
            other_count += 1
            v.vote("other", mem_name)

    v.set_count("yes", yes_count)
    v.set_count("no", no_count)
    v.set_count("other", other_count)

    # the documents for the readings are inside the vote
    # level in the json, so we'll deal with them here
    # and also add relevant actions
    reading_phrases = (
        ("first reading", "reading-1"),
        ("1st reading", "reading-1"),
        ("second reading", "reading-2"),
        ("2nd reading", "reading-2"),
        ("third reading", "reading-3"),
        ("3rd reading", "reading-3"),
        ("final reading", "reading-3"),
    )
    if "amendment" in motion_lower:
        # `status` is always a non-empty string, so it has to be compared
        # rather than truth-tested; deferrals are checked first because a
        # tabled/postponed motion carries status "pass" as well.
        if result in ("Tabled", "Postponed"):
            t = "amendment-deferral"
        elif status == "pass":
            t = "amendment-passage"
        else:
            t = "amendment-failure"
    else:
        t = next(
            (label for phrase, label in reading_phrases if phrase in motion_lower),
            None,
        )

    if t is not None:
        # Every amendment outcome collapses to "amendment"; a reading keeps
        # its own label.
        vote["type"] = "amendment" if "amendment" in t else t

    # some documents/versions are hiding in votes.
    if "AttachmentPath" in vote:
        version_kinds = ("enrollment", "engrossment", "introduction")
        is_version = (
            vote.get("DocumentType") in version_kinds or motion in version_kinds
        )
        self.add_documents(vote["AttachmentPath"], bill, is_version)

    return v
def parse_bill_actions_table(self, bill, action_table, bill_id, session, url, bill_chamber):
    """Record every row of a bill's action table and yield the votes found in it.

    Each row after the first child of ``action_table`` is added to ``bill``
    as an action.  When the action text also parses as a vote (via
    ``self.parse_vote``), a ``VoteEvent`` is built and yielded.

    Args:
        bill: bill object that receives the actions.
        action_table: element whose children are the action rows.
        bill_id: identifier stored as the ``bill`` of each ``VoteEvent``.
        session: legislative session stored on each ``VoteEvent``.
        url: source URL added to each ``VoteEvent``.
        bill_chamber: stored as ``bill_chamber`` on each ``VoteEvent``.

    Yields:
        One ``VoteEvent`` per action row that contains a vote.
    """
    # vote types that have been reconsidered since last vote of that type
    reconsidered = set()

    for row in action_table.xpath("*")[1:]:
        raw_date = row[0].text_content()
        date = dt.datetime.strptime(raw_date, "%m/%d/%Y").strftime("%Y-%m-%d")
        actor_code = row[1].text_content().upper()
        action_text = row[2].text_content()
        actor = self._vote_type_map[actor_code]
        act_type, committees = categorize_action(action_text)

        # XXX: Translate short-code to full committee name for the
        #      matcher.
        committee_names = []
        for short_code in committees or ():
            try:
                full_name = self.short_ids[short_code]["name"]
            except KeyError:
                # Unknown short code: leave the committee out.
                continue
            committee_names.append(full_name)

        act = bill.add_action(action_text, date, chamber=actor, classification=act_type)
        for full_name in committee_names:
            act.add_related_entity(name=full_name, entity_type="organization")

        parsed = self.parse_vote(action_text)
        if not parsed:
            if re.search("reconsider", action_text, re.IGNORECASE):
                reconsidered.add(actor)
            continue

        tallies, motion = parsed
        if actor in reconsidered:
            motion_text = "Reconsider: " + motion
        else:
            motion_text = motion

        vote = VoteEvent(
            start_date=date,
            chamber=actor,
            bill=bill_id,
            bill_chamber=bill_chamber,
            legislative_session=session,
            motion_text=motion_text,
            result="pass" if "passed" in action_text.lower() else "fail",
            classification="passage",
        )
        reconsidered.discard(actor)
        vote.add_source(url)
        vote.set_count("yes", int(tallies["n_yes"] or 0))
        vote.set_count("no", int(tallies["n_no"] or 0))
        vote.set_count("not voting", int(tallies["n_excused"] or 0))

        # Both plain ayes and "yes_resv" entries are recorded as yes votes.
        for key in ("yes", "yes_resv"):
            for voter in split_specific_votes(tallies[key]):
                vote.yes(self.clean_voter_name(voter))
        for voter in split_specific_votes(tallies["no"]):
            vote.no(self.clean_voter_name(voter))
        for voter in split_specific_votes(tallies["excused"]):
            vote.vote("not voting", self.clean_voter_name(voter))

        yield vote
def parse_html_vote(self, bill, actor, date, motion, url, uniqid):
    """Scrape an HTML vote page and yield the ``Vote`` it describes.

    Args:
        bill: bill the vote belongs to.
        actor: chamber stored on the ``Vote``.
        date: start date stored on the ``Vote``.
        motion: motion text stored on the ``Vote``.
        url: vote page to fetch; also recorded as the source.
        uniqid: converted with ``str()`` and used as the vote identifier.

    Yields:
        One ``Vote`` for a floor vote, or whatever
        ``self.scrape_committee_vote`` yields when the page describes a
        committee vote.  Nothing is yielded when the page is missing, is a
        voice vote, or carries only a placeholder description.

    Raises:
        NotImplementedError: the description gives no pass/fail outcome.
    """
    try:
        html = self.get(url).text
    except scrapelib.HTTPError:
        self.warning("A vote page not found for bill {}".format(
            bill.identifier))
        return

    page = lxml.html.fromstring(html)
    page.make_links_absolute(url)

    descr = page.xpath("//b")[0].text_content()
    if descr == "":
        # New page method
        descr = page.xpath("//center")[0].text

    if "on voice vote" in descr:
        return

    if "committee" in descr.lower():
        yield from self.scrape_committee_vote(
            bill, actor, date, motion, page, url, uniqid)
        return

    if "Passed" in descr:
        passed = True
    elif "Failed" in descr:
        passed = False
    elif "UTAH STATE LEGISLATURE" in descr or descr.strip() == "-":
        # Placeholder text instead of an outcome: nothing to record.
        return
    else:
        self.warning(descr)
        raise NotImplementedError("Can't see if we passed or failed")

    # Every bold heading after the first is paired, in order, with a table
    # of voter names; a heading reads "<label> - <count>".
    headings = page.xpath("//b")[1:]
    tables = page.xpath("//table")
    vdict = {}
    for heading, table in zip(headings, tables):
        label, dash, count_text = heading.text_content().partition("-")
        if not dash:
            continue
        people = []
        count = int(count_text.strip())
        for cell in table.xpath(".//font[@face='Arial']"):
            people.append(cell.text_content().strip())
        vdict[label.strip()] = {"count": count, "people": people}

    vote = Vote(
        chamber=actor,
        start_date=date,
        motion_text=motion,
        result="pass" if passed else "fail",
        bill=bill,
        classification="passage",
        identifier=str(uniqid),
    )
    for option, label in (
        ("yes", "Yeas"),
        ("no", "Nays"),
        ("other", "Absent or not voting"),
    ):
        vote.set_count(option, vdict[label]["count"])
    vote.add_source(url)

    for person in vdict["Yeas"]["people"]:
        vote.yes(person)
    for person in vdict["Nays"]["people"]:
        vote.no(person)
    for person in vdict["Absent or not voting"]["people"]:
        vote.vote("other", person)

    yield vote
def scrape_pdf_for_votes(self, session, actor, date, motion, href):
    """Parse a roll-call PDF into a ``VoteEvent``.

    The PDF text lines come from ``self.fetch_pdf_lines(href)``.  A line
    consisting solely of a pass/fail word sets the outcome, the
    "<n> YEAS <n> NAYS <n> PRESENT" line sets the totals, and later lines
    that start with one of ``VOTE_VALUES`` are collected and handed to
    ``find_columns_and_parse`` to obtain a name -> vote code mapping.

    Args:
        session: legislative session; also the key into ``session_details``
            used to resolve "Mr. Speaker" / "Mr. President".
        actor: organization stored on the ``VoteEvent``; its
            ``"classification"`` entry is read when no pass/fail word exists.
        date: start date stored on the ``VoteEvent``.
        motion: motion text; also passed to ``_categorize_action``.
        href: URL of the PDF; used as source and as ``dedupe_key``.

    Returns:
        The populated ``VoteEvent``, or ``False`` when no PDF lines could be
        fetched.

    Raises:
        Exception: the PDF contains two pass/fail words that disagree.
    """
    # Set whenever a mismatch/fallback warning is issued, so one summary
    # warning pointing at the PDF can be logged at the end.
    warned = False
    # vote indicator, a few spaces, a name, newline or multiple spaces
    # VOTE_RE = re.compile('(Y|N|E|NV|A|P|-)\s{2,5}(\w.+?)(?:\n|\s{2})')
    COUNT_RE = re.compile(
        r"^(\d+)\s+YEAS?\s+(\d+)\s+NAYS?\s+(\d+)\s+PRESENT(?:\s+(\d+)\s+NOT\sVOTING)?\s*$"
    )
    # NOTE(review): "PREVAILED" maps to "fail" here -- confirm this is
    # intended and not a slip.
    PASS_FAIL_WORDS = {
        "PASSED": "pass",
        "PREVAILED": "fail",
        "ADOPTED": "pass",
        "CONCURRED": "pass",
        "FAILED": "fail",
        "LOST": "fail",
    }

    pdflines = self.fetch_pdf_lines(href)

    if not pdflines:
        return False

    yes_count = no_count = present_count = 0
    yes_votes = []
    no_votes = []
    present_votes = []
    excused_votes = []
    not_voting = []
    absent_votes = []
    passed = None
    counts_found = False
    vote_lines = []
    for line in pdflines:
        # consider pass/fail as a document property instead of a result of the vote count
        # extract the vote count from the document instead of just using counts of names
        if not line.strip():
            continue
        elif line.strip() in PASS_FAIL_WORDS:
            # Crash on duplicate pass/fail status that differs from previous status
            if passed is not None and passed != PASS_FAIL_WORDS[line.strip()]:
                raise Exception("Duplicate pass/fail matches in [%s]" % href)
            passed = PASS_FAIL_WORDS[line.strip()]
        elif COUNT_RE.match(line):
            # not_voting_count is captured but not used afterwards; the
            # "not voting" total below comes from the parsed names instead.
            (yes_count, no_count, present_count, not_voting_count) = COUNT_RE.match(
                line
            ).groups()
            yes_count = int(yes_count)
            no_count = int(no_count)
            present_count = int(present_count)
            counts_found = True
        elif counts_found:
            # Only lines after the totals line are candidates for voter rows.
            for value in VOTE_VALUES:
                if re.search(r"^\s*({})\s+\w".format(value), line):
                    vote_lines.append(line)
                    break

    votes = find_columns_and_parse(vote_lines)
    for name, vcode in votes.items():
        # Presiding officers are listed by title; swap in the configured name.
        if name == "Mr. Speaker":
            name = session_details[session]["speaker"]
        elif name == "Mr. President":
            name = session_details[session]["president"]
        else:
            # Converts "Davis,William" to "Davis, William".
            name = re.sub(r"\,([a-zA-Z])", r", \1", name)

        if vcode == "Y":
            yes_votes.append(name)
        elif vcode == "N":
            no_votes.append(name)
        elif vcode == "P":
            present_votes.append(name)
        elif vcode == "E":
            excused_votes.append(name)
        elif vcode == "NV":
            not_voting.append(name)
        elif vcode == "A":
            absent_votes.append(name)

    # fake the counts
    if yes_count == 0 and no_count == 0 and present_count == 0:
        # All three totals still zero: fall back to counting the parsed names.
        yes_count = len(yes_votes)
        no_count = len(no_votes)
    else:  # audit
        if yes_count != len(yes_votes):
            self.warning(
                "Mismatched yes count [expect: %i] [have: %i]"
                % (yes_count, len(yes_votes))
            )
            warned = True
        if no_count != len(no_votes):
            self.warning(
                "Mismatched no count [expect: %i] [have: %i]"
                % (no_count, len(no_votes))
            )
            warned = True

    if passed is None:
        if actor["classification"] == "lower":  # senate doesn't have these lines
            self.warning(
                "No pass/fail word found; fall back to comparing yes and no vote."
            )
            warned = True
        # Applies to either chamber: a simple yes-versus-no comparison.
        passed = "pass" if yes_count > no_count else "fail"

    classification, _ = _categorize_action(motion)
    vote_event = VoteEvent(
        legislative_session=session,
        motion_text=motion,
        classification=classification,
        organization=actor,
        start_date=date,
        result=passed,
    )
    for name in yes_votes:
        vote_event.yes(name)
    for name in no_votes:
        vote_event.no(name)
    # "Present" votes are recorded under the "other" option.
    for name in present_votes:
        vote_event.vote("other", name)
    for name in excused_votes:
        vote_event.vote("excused", name)
    for name in not_voting:
        vote_event.vote("not voting", name)
    for name in absent_votes:
        vote_event.vote("absent", name)

    vote_event.set_count("yes", yes_count)
    vote_event.set_count("no", no_count)
    vote_event.set_count("other", present_count)
    vote_event.set_count("excused", len(excused_votes))
    vote_event.set_count("absent", len(absent_votes))
    vote_event.set_count("not voting", len(not_voting))

    vote_event.add_source(href)

    # for distinguishing between votes with the same id and on same day
    vote_event.dedupe_key = href

    if warned:
        self.warning("Warnings were issued. Best to check %s" % href)
    return vote_event
def process_vote(self, votes, url, base_url, bill, legislators, chamber_dict, vote_results):
    """Yield a ``VoteEvent`` for every usable entry in ``votes["items"]``.

    Args:
        votes: mapping whose ``"items"`` list holds the vote records.
        url: source URL added to every ``VoteEvent``.
        base_url: prefix for a record's ``"link"`` when the vote detail has
            to be fetched separately.
        bill: bill the votes belong to; its identifier is part of the
            dedupe key.
        legislators: mapping from voter id to the name recorded on the vote.
        chamber_dict: mapping from the record's chamber value to a chamber.
        vote_results: mapping from lower-cased result text to a truthy or
            falsy pass flag.

    Yields:
        One ``VoteEvent`` per record; records lacking vote data or a date
        are skipped with a warning.
    """
    for v in votes["items"]:
        if "yeas" not in v:
            # sometimes the actual vote is buried a second layer deep
            v = self.get(base_url + v["link"]).json()
            if "yeas" not in v:
                self.logger.warning("No vote info available, skipping")
                continue

        try:
            chamber = chamber_dict[v["chamber"]]
        except KeyError:
            chamber = "lower" if "house" in v["apn"] else "upper"

        # The date lives under "date", or failing that under "occurred".
        for date_key in ("date", "occurred"):
            try:
                moment = self._tz.localize(
                    datetime.datetime.strptime(v[date_key], "%m/%d/%y"))
                date = "{:%Y-%m-%d}".format(moment)
            except KeyError:
                continue
            break
        else:
            self.logger.warning("No date found for vote, skipping")
            continue

        try:
            motion = v["action"]
        except KeyError:
            motion = v["motiontype"]

        if motion in self._vote_motion_dict:
            motion_text = self._vote_motion_dict[motion]
        else:
            self.warning(
                "Unknown vote code {}, please add to _vote_motion_dict".format(motion))
            motion_text = v["results"]

        # Sometimes Ohio's SOLAR will only return part of the JSON, so in that case skip
        truncated = isinstance(v["yeas"], str) and isinstance(v["nays"], str)
        if not motion and truncated:
            message = 'Malformed JSON found for vote ("revno" of {}); skipping'
            self.warning(message.format(v["revno"]))
            continue

        result = v.get("results") or v.get("passed")
        if result is None:
            # No stated outcome: compare the sizes of the yea and nay lists.
            result = "passed" if len(v["yeas"]) > len(v["nays"]) else "failed"

        passed = vote_results[result.lower()]

        # Committee and floor votes differ only in their classification.
        if "committee" in v:
            classification = "committee-passage"
        else:
            classification = "passage"
        vote = VoteEvent(
            chamber=chamber,
            start_date=date,
            motion_text=motion_text,
            result="pass" if passed else "fail",
            classification=classification,
            bill=bill,
        )

        # Concatenate the bill identifier and vote identifier to avoid collisions
        vote.dedupe_key = "{}:{}".format(bill.identifier.replace(" ", ""), v["revno"])

        # the yea and nay counts are not displayed, but vote totals are
        # and passage status is.
        tally = {"yes": 0, "no": 0, "absent": 0, "excused": 0}
        for voter_id in v["yeas"]:
            vote.yes(legislators[voter_id])
            tally["yes"] += 1
        for voter_id in v["nays"]:
            vote.no(legislators[voter_id])
            tally["no"] += 1
        for option in ("absent", "excused"):
            if option not in v:
                continue
            for voter_id in v[option]:
                vote.vote(option, legislators[voter_id])
                tally[option] += 1

        for option in ("yes", "no", "absent", "excused"):
            vote.set_count(option, tally[option])

        # check to see if there are any other things that look
        # like vote categories, throw a warning if so
        known_keys = ["yeas", "nays", "absent", "excused"]
        for key, val in v.items():
            if type(val) is not list or len(val) == 0 or key in known_keys:
                continue
            if val[0] in legislators:
                self.logger.warning(
                    "{k} looks like a vote type that's not being counted."
                    " Double check it?".format(k=key))

        vote.add_source(url)

        yield vote
def scrape_vote(self, bill, date, url):
    """Fetch one vote as JSON and yield the resulting ``VoteEvent``.

    Args:
        bill: bill the vote belongs to.
        date: start date stored on the ``VoteEvent``.
        url: JSON endpoint for the vote; also used as source and dedupe key.

    Yields:
        One ``VoteEvent``, or nothing when the chamber cannot be determined
        from the action log or the motion text is empty.
    """
    page = self.get(url).json()

    location = page["actionLog"]["FullName"]
    chamber = None
    if location:
        for keyword, code in (
            ("House", "lower"),
            ("Senate", "upper"),
            ("Joint", "legislature"),
        ):
            if keyword in location:
                chamber = code
                break
    if chamber is None:
        # Covers both an empty location and one naming no known chamber.
        self.warning("Bad Vote chamber: '%s', skipping" % location)
        return

    motion = page["actionLog"]["StatusText"]
    if not motion:
        # If we can't detect a motion, skip this vote
        return

    yes_count = page["Yeas"]
    no_count = page["Nays"]
    excused_count = page["Excused"]
    absent_count = page["Absent"]

    outcome = "pass" if yes_count > no_count else "fail"

    if motion.startswith("Do Pass"):
        vtype = "passage"
    elif motion == "Concurred in amendments":
        vtype = "amendment"
    # commenting out until we add these back to OS-core
    # elif motion == "Veto override":
    #     vtype = "veto-override"
    else:
        vtype = []

    vote = VoteEvent(
        chamber=chamber,
        start_date=date,
        motion_text=motion,
        result=outcome,
        classification=vtype,
        bill=bill,
    )
    # differentiate nearly identical votes
    vote.dedupe_key = url

    vote.add_source(url)
    for option, total in (
        ("yes", yes_count),
        ("no", no_count),
        ("excused", excused_count),
        ("absent", absent_count),
    ):
        vote.set_count(option, total)

    # Roll-call entries with any other "Vote1" value are left unrecorded.
    other_options = {"Excused": "excused", "Absent": "absent"}
    for person in page["RollCalls"]:
        cast = person["Vote1"]
        if cast in ("Aye", "Yea"):
            vote.yes(person["UniqueName"])
        elif cast == "Nay":
            vote.no(person["UniqueName"])
        elif cast in other_options:
            vote.vote(other_options[cast], person["UniqueName"])

    yield vote
def _parse_votes(self, url, vote, bill):
    """Given a vote url and a vote dict, extract the voters and the vote
    counts from the vote page and return them as a vote object.

    Args:
        url: vote document to fetch.  A URL ending in ".pdf" is handed to
            ``PDFCommitteeVote``; anything else is parsed as HTML.
        vote: mapping with at least ``action``, ``vote_url``, ``chamber``
            and ``date``.  Its ``"motion"`` key is set here.
        bill: bill the vote belongs to.

    Returns:
        For PDFs, the result of ``PDFCommitteeVote(...).asvote()``; for HTML
        pages, a populated ``VoteEvent``.  ``None`` when the PDF is missing
        or unparseable, or the HTML page has no second table.

    Raises:
        Exception: the action text contains both a passage and a failure
            indicator while yeas outnumber nays, or contains no indicator
            at all.
    """
    if url.lower().endswith(".pdf"):

        try:
            resp = self.get(url)
        except HTTPError:
            # This vote document wasn't found.
            msg = "No document found at url %r" % url
            self.logger.warning(msg)
            return

        try:
            v = PDFCommitteeVote(url, resp.content, bill)
            return v.asvote()
        except PDFCommitteeVoteParseError:
            # Warn and skip.
            self.warning("Could't parse committee vote at %r" % url)
            return

    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)

    # Yes, no, excused, absent.
    try:
        vals = doc.xpath("//table")[1].xpath("tr/td/text()")
    except IndexError:
        # Most likely was a bogus link lacking vote data.
        return

    # The unpacking requires exactly four cell texts, in this order.
    yes_count, no_count, excused_count, absent_count = map(int, vals)

    # Get the motion.
    try:
        motion = doc.xpath("//br")[-1].tail.strip()
    except IndexError:
        # Some of them mysteriously have no motion listed.
        motion = vote["action"]

    if not motion:
        motion = vote["action"]

    vote["motion"] = motion

    # Read what is needed from the incoming dict now: the name `vote` is
    # rebound to the VoteEvent just below.
    action = vote["action"]
    vote_url = vote["vote_url"]

    vote = VoteEvent(
        chamber=vote["chamber"],
        start_date=vote["date"],
        motion_text=vote["motion"],
        result="fail",  # placeholder
        classification="passage",
        bill=bill,
        bill_action=vote["action"],
    )
    vote.dedupe_key = vote_url  # URL contains sequence number
    vote.add_source(vote_url)
    vote.set_count("yes", yes_count)
    vote.set_count("no", no_count)
    vote.set_count("excused", excused_count)
    vote.set_count("absent", absent_count)

    # Each cell text of the third table is "<code>\xa0...<name>"; cells made
    # up only of non-breaking spaces are skipped.
    for text in doc.xpath("//table")[2].xpath("tr/td/text()"):
        if not text.strip("\xa0"):
            continue
        v, name = filter(None, text.split("\xa0"))
        # A name given in parentheses is treated as the short name
        regex = re.compile(r".*?\((.*?)\)")
        short_name = re.findall(regex, name)
        if len(short_name) > 0:
            note = "Short Name: " + short_name[0]
        else:
            note = ""
        # Name without brackets like 'Kary, Douglas'
        name = re.sub(r"[\(\[].*?[\)\]]", "", name)
        if v == "Y":
            vote.yes(name, note=note)
        elif v == "N":
            vote.no(name, note=note)
        elif v == "E":
            vote.vote("excused", name, note=note)
        elif v == "A":
            vote.vote("absent", name, note=note)

    # code to determine value of `passed`
    passed = None

    # some actions take a super majority, so we aren't just
    # comparing the yeas and nays here.
    for i in vote_passage_indicators:
        if i in action:
            passed = True
            break
    for i in vote_failure_indicators:
        if i in action and passed:
            # a quick explanation:  originally an exception was
            # thrown if both passage and failure indicators were
            # present because I thought that would be a bug in my
            # lists.  Then I found 2007 HB 160.
            # Now passed = False if the nays outnumber the yays..
            # I won't automatically mark it as passed if the yays
            # outnumber the nays because I don't know what requires
            # a supermajority in MT.
            if no_count >= yes_count:
                passed = False
                break
            else:
                # NOTE: the two adjacent literals are joined without a
                # space, so the message reads "...indicatorboth present...".
                raise Exception("passage and failure indicator"
                                "both present at: %s" % url)
        if i in action and passed is None:
            passed = False
            break
    # An ambiguous indicator overrides whatever was decided above and falls
    # back to a plain yes-versus-no comparison.
    for i in vote_ambiguous_indicators:
        if i in action:
            passed = yes_count > no_count
            break
    if passed is None:
        raise Exception("Unknown passage at: %s" % url)

    vote.result = "pass" if passed else "fail"

    return vote