def test_vote_event_bill_actions_two_stage(): # this test is very similar to what we're testing in test_vote_event_bill_actions w/ # ve3 and ve4, that two bills that reference the same action won't conflict w/ the # OneToOneField, but in this case we do it in two stages so that the conflict is found # even if the votes weren't in the same scrape create_jurisdiction() bill = ScrapeBill("HB 1", "1900", "Axe & Tack Tax Act", chamber="lower") bill.add_action(description="passage", date="1900-04-02", chamber="lower") ve1 = ScrapeVoteEvent( legislative_session="1900", motion_text="passage", start_date="1900-04-02", classification="passage:bill", result="pass", bill_chamber="lower", bill="HB 1", bill_action="passage", chamber="lower", ) ve2 = ScrapeVoteEvent( legislative_session="1900", motion_text="passage", start_date="1900-04-02", classification="passage:bill", result="pass", bill_chamber="lower", bill="HB 1", bill_action="passage", chamber="lower", ) # disambiguate them ve1.dedupe_key = "one" ve2.dedupe_key = "two" bi = BillImporter("jid") bi.import_data([bill.as_dict()]) # first imports just fine VoteEventImporter("jid", bi).import_data([ve1.as_dict()]) votes = list(VoteEvent.objects.all()) assert len(votes) == 1 assert votes[0].bill_action is not None # when second is imported, ensure that action stays pinned to first just as it would # have if they were both in same import VoteEventImporter("jid", bi).import_data([ve1.as_dict(), ve2.as_dict()]) votes = list(VoteEvent.objects.all()) assert len(votes) == 2 assert votes[0].bill_action is not None assert votes[1].bill_action is None
def build_vote(session, bill_id, url, vote_record, chamber, motion_text): # When they vote in a substitute they mark it as XHB bill_id = bill_id.replace("XHB", "HB") passed = len(vote_record["yes"]) > len(vote_record["no"]) vote_event = VoteEvent( result="pass" if passed else "fail", chamber=chamber, start_date=vote_record["date"].strftime("%Y-%m-%d"), motion_text=motion_text, classification="passage", legislative_session=session, bill=bill_id, bill_chamber="upper" if bill_id[0] == "S" else "lower", ) vote_event.dedupe_key = url vote_event.set_count("yes", len(vote_record["yes"])) vote_event.set_count("no", len(vote_record["no"])) vote_event.set_count("excused", len(vote_record["excused"])) vote_event.set_count("absent", len(vote_record["absent"])) vote_event.set_count("other", len(vote_record["other"])) for vote_type in ["yes", "no", "excused", "absent", "other"]: for voter in vote_record[vote_type]: vote_event.vote(vote_type, voter) vote_event.add_source(url) return vote_event
def parse_vote_page(self, vote_url, bill): vote_html = self.get(vote_url).text doc = lxml.html.fromstring(vote_html) # chamber if "senate" in vote_url: chamber = "upper" else: chamber = "lower" # date in the following format: Mar 23, 2009 date = doc.xpath('//td[starts-with(text(), "Legislative")]')[0].text date = date.replace("\xa0", " ") date = datetime.datetime.strptime(date[18:], "%b %d, %Y") # motion motion = "".join(x.text_content() for x in doc.xpath('//td[@colspan="23"]')) if motion == "": motion = "No motion given" # XXX: Double check this. See SJ 3. motion = motion.replace("\xa0", " ") # totals tot_class = doc.xpath('//td[contains(text(), "Yeas")]')[0].get("class") totals = doc.xpath('//td[@class="%s"]/text()' % tot_class)[1:] yes_count = int(totals[0].split()[-1]) no_count = int(totals[1].split()[-1]) other_count = int(totals[2].split()[-1]) other_count += int(totals[3].split()[-1]) other_count += int(totals[4].split()[-1]) passed = yes_count > no_count vote = VoteEvent( bill=bill, chamber=chamber, start_date=date.strftime("%Y-%m-%d"), motion_text=motion, classification="passage", result="pass" if passed else "fail", ) vote.dedupe_key = vote_url # contains sequence number vote.set_count("yes", yes_count) vote.set_count("no", no_count) vote.set_count("other", other_count) # go through, find Voting Yea/Voting Nay/etc. and next tds are voters func = None for td in doc.xpath("//td/text()"): td = td.replace("\xa0", " ") if td.startswith("Voting Yea"): func = vote.yes elif td.startswith("Voting Nay"): func = vote.no elif td.startswith("Not Voting"): func = vote.other elif td.startswith("Excused"): func = vote.other elif func: td = td.rstrip("*") func(td) return vote
def add_vote(self, bill, chamber, date, text, url): votes = re.findall(r"Ayes,?[\s]?(\d+)[,;]\s+N(?:oes|ays),?[\s]?(\d+)", text) yes, no = int(votes[0][0]), int(votes[0][1]) vtype = [] for regex, type in motion_classifiers.items(): if re.match(regex, text): vtype = type break v = VoteEvent( chamber=chamber, start_date=TIMEZONE.localize(date), motion_text=text, result="pass" if yes > no else "fail", classification=vtype, bill=bill, ) v.dedupe_key = url.split("/")[-1] v.set_count("yes", yes) v.set_count("no", no) # fetch the vote itself if url: v.add_source(url) if "av" in url: self.add_house_votes(v, url) elif "sv" in url: self.add_senate_votes(v, url) return v
def scrape_senate_vote(self, bill, url, date): try: filename, resp = self.urlretrieve(url) except scrapelib.HTTPError: self.warning("missing vote file %s" % url) return vote = VoteEvent( chamber="upper", start_date=date.strftime("%Y-%m-%d"), motion_text="Passage", # setting 'fail' for now. result="fail", classification="passage", bill=bill, ) vote.add_source(url) vote.dedupe_key = url text = convert_pdf(filename, "text").decode("utf-8") os.remove(filename) if re.search(r"Yea:\s+\d+\s+Nay:\s+\d+\s+Absent:\s+\d+", text): yield from self.scrape_senate_vote_3col(bill, vote, text, url, date) return data = re.split(r"(Yea|Nay|Absent)s?:", text)[::-1] data = list(filter(None, data)) keymap = dict(yea="yes", nay="no") actual_vote = collections.defaultdict(int) vote_count = {"yes": 0, "no": 0, "other": 0} while True: if not data: break vote_val = data.pop() key = keymap.get(vote_val.lower(), "other") values = data.pop() for name in re.split(r"(?:[\s,]+and\s|[\s,]{2,})", values): if name.lower().strip() == "none.": continue name = name.replace("..", "") name = re.sub(r"\.$", "", name) name = name.strip("-1234567890 \n") if not name: continue vote.vote(key, name) actual_vote[vote_val] += 1 vote_count[key] += 1 assert actual_vote[vote_val] == vote_count[key] for key, value in vote_count.items(): vote.set_count(key, value) # updating result with actual value vote.result = ("pass" if vote_count["yes"] > (vote_count["no"] + vote_count["other"]) else "fail") yield vote
def test_vote_event_dedupe_key_dedupe(): j = create_jurisdiction() Organization.objects.create(id="org-id", name="Legislature", classification="legislature", jurisdiction=j) vote_event = ScrapeVoteEvent( legislative_session="1900", start_date="2013", classification="anything", result="passed", motion_text="a vote on something", identifier="Roll Call No. 1", ) vote_event.dedupe_key = "foo" bi = BillImporter("jid") _, what = VoteEventImporter("jid", bi).import_item(vote_event.as_dict()) assert what == "insert" assert VoteEvent.objects.count() == 1 # same exact vote event, no changes _, what = VoteEventImporter("jid", bi).import_item(vote_event.as_dict()) assert what == "noop" assert VoteEvent.objects.count() == 1 # new info, update vote_event.result = "failed" _, what = VoteEventImporter("jid", bi).import_item(vote_event.as_dict()) assert what == "update" assert VoteEvent.objects.count() == 1 # new bill identifier, update vote_event.identifier = "First Roll Call" _, what = VoteEventImporter("jid", bi).import_item(vote_event.as_dict()) assert what == "update" assert VoteEvent.objects.count() == 1 # new identifier, insert vote_event.dedupe_key = "bar" _, what = VoteEventImporter("jid", bi).import_item(vote_event.as_dict()) assert what == "insert" assert VoteEvent.objects.count() == 2
def parse_committee_votes(self, bill, url): bill.add_source(url) html = self.get(url).text doc = lxml.html.fromstring(html) doc.make_links_absolute(url) chamber = "upper" if "Senate" in doc.xpath("string(//h1)") else "lower" committee = tuple(doc.xpath("//h2")[0].itertext())[-2].strip() for link in doc.xpath("//a[contains(@href, 'listVoteSummary.cfm')]"): # Date for fmt in ("%m/%d/%Y", "%m-%d-%Y"): date = link.xpath("../../td")[0].text_content() try: date = datetime.datetime.strptime(date, fmt) except ValueError: continue break # Motion motion = link.text_content().split(" - ")[-1].strip() motion = "Committee vote (%s): %s" % (committee, motion) # Roll call vote_url = link.attrib["href"] rollcall = self.parse_upper_committee_vote_rollcall(bill, vote_url) vote = VoteEvent( chamber=chamber, start_date=tz.localize(date), motion_text=motion, classification=[], result="pass" if rollcall["passed"] else "fail", bill=bill, ) vote.dedupe_key = vote_url vote.set_count("yes", rollcall["yes_count"]) vote.set_count("no", rollcall["no_count"]) vote.set_count("other", rollcall["other_count"]) for voteval in ("yes", "no", "other"): for name in rollcall.get(voteval + "_votes", []): vote.vote(voteval, name) vote.add_source(url) vote.add_source(vote_url) yield vote
def scrape_votes(self, bill, bill_page, chamber): vote_links = bill_page.xpath( '//table[contains(@class,"history")]//a[contains(@href, "view_votes")]' ) for vote_link in vote_links: vote_url = vote_link.attrib["href"] date_td, motion_td, *_ = vote_link.xpath("ancestor::tr/td") date = datetime.strptime(date_td.text, "%b %d, %Y") motion_text = motion_td.text_content() vote_page = self.lxmlize(vote_url) passed = "Passed" in motion_text or "Advanced" in motion_text cells = vote_page.xpath( '//div[contains(@class,"table-responsive")]/table//td') vote = VoteEvent( bill=bill, chamber=chamber, start_date=TIMEZONE.localize(date), motion_text=motion_text, classification="passage", result="pass" if passed else "fail", ) yes_count = self.process_count(vote_page, "Yes:") no_count = self.process_count(vote_page, "No:") exc_count = self.process_count(vote_page, "Excused - Not Voting:") absent_count = self.process_count(vote_page, "Absent - Not Voting:") present_count = self.process_count(vote_page, "Present - Not Voting:") vote.set_count("yes", yes_count) vote.set_count("no", no_count) vote.set_count("excused", exc_count) vote.set_count("absent", absent_count) vote.set_count("abstain", present_count) query_params = urllib.parse.parse_qs( urllib.parse.urlparse(vote_url).query) vote.dedupe_key = query_params["KeyID"][0] vote.add_source(vote_url) for chunk in range(0, len(cells), 2): name = cells[chunk].text vote_type = cells[chunk + 1].text if name and vote_type: vote.vote(VOTE_TYPE_MAP.get(vote_type.lower(), "other"), name) yield vote
def scrape_vote(self, chamber, session, bill_id, vote_url): try: resp = self.get(vote_url) html = resp.text except scrapelib.HTTPError: return doc = lxml.html.fromstring(html) motion = doc.xpath("//p[1]//b[1]/text()")[-1].strip() if len(motion) == 0: print(motion) motion = doc.xpath("//h2[1]/text()")[0].strip() vote_count = ( doc.xpath("//h3[contains(text(),'YEA and ')]/text()")[0].strip().split() ) yeas = int(vote_count[0]) nays = int(vote_count[3]) date = doc.xpath("//b[contains(text(),'Date:')]/../text()")[1].strip() date = datetime.datetime.strptime(date, "%m/%d/%Y").date() vote = VoteEvent( chamber="lower", start_date=date, motion_text=motion, result="pass" if yeas > nays else "fail", classification="passage", legislative_session=session, bill=bill_id, bill_chamber=chamber, ) vote.set_count("yes", yeas) vote.set_count("no", nays) vote.add_source(vote_url) vote.dedupe_key = vote_url # first table has YEAs for name in doc.xpath("//table[1]//font/text()"): vote.yes(name.strip()) # second table is nays for name in doc.xpath("//table[2]//font/text()"): vote.no(name.strip()) yield vote
def scrape_votes(self, bill, page): base_url = "https://apps.azleg.gov/api/BillStatusFloorAction" for header in page["FloorHeaders"]: params = { "billStatusId": page["BillId"], "billStatusActionId": header["BillStatusActionId"], "includeVotes": "true", } resp = self.get(base_url, timeout=80, params=params) actions = json.loads(resp.content.decode("utf-8")) for action in actions: if action["Action"] == "No Action": continue if action["ReportDate"] is None: continue cleaned_date = action["ReportDate"].split(".")[0] action_date = datetime.datetime.strptime( cleaned_date, "%Y-%m-%dT%H:%M:%S" ) vote = VoteEvent( chamber={"S": "upper", "H": "lower"}[header["LegislativeBody"]], motion_text=action["Action"], classification="passage", result=( "pass" if action["UnanimouslyAdopted"] or action["Ayes"] > action["Nays"] else "fail" ), start_date=action_date.strftime("%Y-%m-%d"), bill=bill, ) vote.add_source(resp.url) vote.set_count("yes", action["Ayes"] or 0) vote.set_count("no", action["Nays"] or 0) vote.set_count("other", (action["Present"] or 0)) vote.set_count("absent", (action["Absent"] or 0)) vote.set_count("excused", (action["Excused"] or 0)) vote.set_count("not voting", (action["NotVoting"] or 0)) for v in action["Votes"]: vote_type = {"Y": "yes", "N": "no"}.get(v["Vote"], "other") vote.vote(vote_type, v["Legislator"]["FullName"]) vote.dedupe_key = resp.url + str(action["ReferralNumber"]) yield vote
def scrape_chamber_votes(self, chamber, session): url = { "upper": "%s/%s" % (RI_URL_BASE, "SVotes"), "lower": "%s/%s" % (RI_URL_BASE, "HVotes"), }[chamber] action = "%s/%s" % (url, "votes.asp") dates = self.get_vote_dates(url, session) for date in dates: votes = self.parse_vote_page(self.post_to(action, date), url, session) for vote_dict in votes: for vote in vote_dict.values(): count = vote["count"] chamber = { "H": "lower", "S": "upper" }[vote["meta"]["chamber"]] try: bill_id = self._bill_id_by_type[(chamber, vote["meta"]["bill"])] except KeyError: self.warning("no such bill_id %s %s", chamber, vote["meta"]["bill"]) continue v = VoteEvent( chamber=chamber, start_date=vote["time"].strftime("%Y-%m-%d"), motion_text=vote["meta"]["extra"]["motion"], result="pass" if count["passage"] else "fail", classification="passage", legislative_session=session, bill=bill_id, bill_chamber=chamber, ) v.set_count("yes", int(count["YEAS"])) v.set_count("no", int(count["NAYS"])) v.set_count("other", int(count["NOT VOTING"])) v.add_source(vote["source"]) v.dedupe_key = vote["source"] for vt in vote["votes"]: key = {"Y": "yes", "N": "no"}.get(vt["vote"], "other") v.vote(key, vt["name"]) yield v
def asvote(self): v = VoteEvent( chamber=self.chamber(), start_date=self.date(), motion_text=self.motion(), result="pass" if self.passed() else "fail", classification="passage", bill=self.bill, ) v.dedupe_key = self.url # URL contains sequence number v.set_count("yes", self.yes_count()) v.set_count("no", self.no_count()) v.set_count("other", self.other_count()) for voter in self.yes_votes(): v.yes(voter) for voter in self.no_votes(): v.no(voter) for voter in self.other_votes(): v.vote("other", voter) v.add_source(self.url) return v
def scrape_votes(self, url, motion, date, chamber, bill): try: vote_pdf, resp = self.urlretrieve(url) except scrapelib.HTTPError: self.warning("Can't find vote file {}, skipping".format(url)) return text = convert_pdf(vote_pdf, "text") os.remove(vote_pdf) # this way we get a key error on a missing vote type motion, passed = self._vote_mapping[motion] yes_votes = [] no_votes = [] other_votes = [] absent_votes = [] not_voting_votes = [] # point at array to add names to cur_array = None precursors = ( ("yeas--", yes_votes), ("nays--", no_votes), ("absent or those not voting--", absent_votes), ("absent and those not voting--", absent_votes), ("not voting--", not_voting_votes), ("voting present--", other_votes), ("present--", other_votes), ("disclaimer", None), ) # split lines on newline, recombine lines that don't end in punctuation lines = _combine_lines(text.decode().split("\n")) for line in lines: # check if the line starts with a precursor, switch to that array for pc, arr in precursors: if pc in line.lower(): cur_array = arr line = line.replace(pc, "") # split names for name in line.split(","): name = name.strip() # move on if that's all there was if not name: continue # None or a Total indicate the end of a section if "None." in name: cur_array = None match = re.match(r"(.+?)\. Total--.*", name) if match: cur_array.append(match.groups()[0]) cur_array = None # append name if it looks ok junk_in_name = False for junk in ( "on final passage", "Necessary", "who would have", "being a tie", "therefore", "Vacancies", "a pair", "Total-", "ATTORNEY", "on final passage", "SPEAKER", "BOARD", "TREASURER", "GOVERNOR", "ARCHIVES", "SECRETARY", ): if junk in name: junk_in_name = True break if cur_array is not None and not junk_in_name: # strip trailing . if name[-1] == ".": name = name[:-1] name = self.clean_voter_name(name) cur_array.append(name) # return vote object yes_count = len(yes_votes) no_count = len(no_votes) absent_count = len(absent_votes) not_voting_count = len(not_voting_votes) other_count = len(other_votes) vote = VoteEvent( chamber=chamber, start_date=self._tz.localize(date), motion_text=motion, result="pass" if passed else "fail", classification="passage", bill=bill, ) vote.dedupe_key = url + "#" + bill.identifier vote.set_count("yes", yes_count) vote.set_count("no", no_count) vote.set_count("absent", absent_count) vote.set_count("not voting", not_voting_count) vote.set_count("other", other_count) vote.add_source(url) for yes_vote in yes_votes: vote.vote("yes", self.clean_voter_name(yes_vote)) for no_vote in no_votes: vote.vote("no", self.clean_voter_name(no_vote)) for absent_vote in absent_votes: vote.vote("absent", self.clean_voter_name(absent_vote)) for not_voting_vote in not_voting_votes: vote.vote("not voting", self.clean_voter_name(not_voting_vote)) for other_vote in other_votes: vote.vote("other", self.clean_voter_name(other_vote)) yield vote
def scrape_assembly_votes(self, session, bill, assembly_url, bill_id): # parse the bill data page, finding the latest html text url = assembly_url + "&Floor%26nbspVotes=Y" data = self.get(url).text doc = lxml.html.fromstring(data) doc.make_links_absolute(url) if "Votes:" in doc.text_content(): vote_motions = [] additional_votes_on_motion = 2 for table in doc.xpath("//table"): date = table.xpath('caption/span[contains(., "DATE:")]') date = next(date[0].itersiblings()).text date = datetime.datetime.strptime(date, "%m/%d/%Y") date = eastern.localize(date) date = date.isoformat() spanText = table.xpath("caption/span/text()") motion = spanText[2].strip() + spanText[3].strip() if motion in vote_motions: motion = motion + f" - Vote {additional_votes_on_motion}" additional_votes_on_motion += 1 else: vote_motions.append(motion) votes = ( table.xpath("caption/span/span")[0].text.split(":")[1].split("/") ) yes_count, no_count = map(int, votes) passed = yes_count > no_count vote = VoteEvent( chamber="lower", start_date=date, motion_text=motion, bill=bill, result="pass" if passed else "fail", classification="passage", ) vote.set_count("yes", yes_count) vote.set_count("no", no_count) absent_count = 0 excused_count = 0 tds = table.xpath("tr/td/text()") votes = [tds[i : i + 2] for i in range(0, len(tds), 2)] vote_dictionary = { "Y": "yes", "NO": "no", "ER": "excused", "AB": "absent", "NV": "not voting", "EL": "other", } for vote_pair in votes: name, vote_val = vote_pair vote.vote(vote_dictionary[vote_val], name) if vote_val == "AB": absent_count += 1 elif vote_val == "ER": excused_count += 1 vote.set_count("absent", absent_count) vote.set_count("excused", excused_count) vote.add_source(url) vote.dedupe_key = url + motion + spanText[1] yield vote
def scrape_vote_history(self, bill, vurl): """ Obtain the information on a vote and link it to the related Bill :param bill: related bill :param vurl: source for the voteEvent information. :return: voteEvent object """ html = self.get(vurl).text doc = lxml.html.fromstring(html) doc.make_links_absolute(vurl) # skip first two rows for row in doc.xpath("//table/tr")[2:]: tds = row.getchildren() if len(tds) != 11: self.warning("irregular vote row: %s" % vurl) continue ( timestamp, motion, vote, yeas, nays, nv, exc, pres, abst, total, result, ) = tds timestamp = timestamp.text.replace("\xa0", " ") timestamp = datetime.datetime.strptime(timestamp, "%m/%d/%Y %H:%M %p") yeas = int(yeas.text) nays = int(nays.text) others = int(nv.text) + int(exc.text) + int(abst.text) + int(pres.text) assert yeas + nays + others == int(total.text) if result.text == "Passed": passed = "pass" else: passed = "fail" vote_link = vote.xpath("a")[0] if "[H]" in vote_link.text: chamber = "lower" else: chamber = "upper" vote = VoteEvent( chamber=chamber, # 'upper' or 'lower' start_date=timestamp.strftime("%Y-%m-%d"), # 'YYYY-MM-DD' format motion_text=motion.text, result=passed, classification="passage", # Can also be 'other' # Provide a Bill instance to link with the VoteEvent... bill=bill, ) vote.set_count("yes", yeas) vote.set_count("no", nays) vote.set_count("other", others) vote.add_source(vurl) # obtain vote rollcall from pdf and add it to the VoteEvent object rollcall_pdf = vote_link.get("href") self.scrape_rollcall(vote, rollcall_pdf) vote.add_source(rollcall_pdf) if rollcall_pdf in self._seen_vote_ids: self.warning("duplicate usage of %s, skipping", rollcall_pdf) continue else: self._seen_vote_ids.add(rollcall_pdf) vote.dedupe_key = rollcall_pdf # distinct KEY for each one yield vote
def scrape_action_page(self, bill, page): action_rows = page.xpath("//tbody/tr") for row in action_rows: action_date = row.xpath("td[1]/text()")[0] action_date = datetime.strptime(action_date, "%m/%d/%Y") action_year = action_date.year action_date = action_date.strftime("%Y-%m-%d") if row.xpath("td[2]/text()"): action_actor = row.xpath("td[2]/text()")[0] action_actor = self.chamber_map_reverse[action_actor.strip()] action_name = row.xpath("string(td[3])") # House votes if "Supplement" in action_name: actor = "lower" if not re.findall(r"(.+)-\s*\d+\s*YEAS", action_name): self.warning( "vote {} did not match regex, skipping".format( action_name)) continue vote_action = re.findall(r"(.+)-\s*\d+\s*YEAS", action_name)[0].strip() y = int(re.findall(r"(\d+)\s*YEAS", action_name)[0]) n = int(re.findall(r"(\d+)\s*NAYS", action_name)[0]) # get supplement number n_supplement = int( re.findall(r"No\.\s*(\d+)", action_name, re.IGNORECASE)[0]) cached_vote = VoteEvent( chamber=actor, start_date=action_date, motion_text=vote_action, result="pass" if y > n else "fail", classification="passage", bill=bill, ) cached_vote.set_count("yes", y) cached_vote.set_count("no", n) housevote_pdf = ( "https://malegislature.gov/Journal/House/{}/{}/RollCalls". format(bill.legislative_session, action_year)) self.scrape_house_vote(cached_vote, housevote_pdf, n_supplement) cached_vote.add_source(housevote_pdf) cached_vote.dedupe_key = "{}#{}".format( housevote_pdf, n_supplement) # XXX: disabled house votes on 8/1 to try to get MA importing again # will leaving this in and commented out once we resolve the ID issue # yield cached_vote # Senate votes if "Roll Call" in action_name: actor = "upper" # placeholder vote_action = action_name.split(" -")[0] # 2019 H86 Breaks our regex, # Ordered to a third reading -- # see Senate Roll Call #25 and House Roll Call 56 if "yeas" in action_name and "nays" in action_name: try: y, n = re.search(r"(\d+) yeas .*? (\d+) nays", action_name.lower()).groups() y = int(y) n = int(n) except AttributeError: y = int( re.search(r"yeas\s+(\d+)", action_name.lower()).group(1)) n = int( re.search(r"nays\s+(\d+)", action_name.lower()).group(1)) # TODO: other count isn't included, set later cached_vote = VoteEvent( chamber=actor, start_date=action_date, motion_text=vote_action, result="pass" if y > n else "fail", classification="passage", bill=bill, ) cached_vote.set_count("yes", y) cached_vote.set_count("no", n) rollcall_pdf = "http://malegislature.gov" + row.xpath( "string(td[3]/a/@href)") self.scrape_senate_vote(cached_vote, rollcall_pdf) cached_vote.add_source(rollcall_pdf) cached_vote.dedupe_key = rollcall_pdf # XXX: also disabled, see above note # yield cached_vote attrs = self.categorizer.categorize(action_name) action = bill.add_action( action_name.strip(), action_date, chamber=action_actor, classification=attrs["classification"], ) for com in attrs.get("committees", []): com = com.strip() action.add_related_entity(com, entity_type="organization")
def scrape_vote(self, bill, vote_id, session): vote_url = ( "https://legis.delaware.gov/json/RollCall/GetRollCallVoteByRollCallId" ) form = {"rollCallId": vote_id, "sort": "", "group": "", "filter": ""} self.info("Fetching vote {} for {}".format(vote_id, bill.identifier)) page = self.post(url=vote_url, data=form, allow_redirects=True).json() if page: roll = page["Model"] vote_chamber = self.chamber_map[roll["ChamberName"]] # "7/1/16 01:00 AM" vote_date = dt.datetime.strptime( roll["TakenAtDateTime"], "%m/%d/%y %I:%M %p").strftime("%Y-%m-%d") # TODO: What does this code mean? vote_motion = roll["RollCallVoteType"] vote_passed = "pass" if roll[ "RollCallStatus"] == "Passed" else "fail" other_count = (int(roll["NotVotingCount"]) + int(roll["VacantVoteCount"]) + int(roll["AbsentVoteCount"]) + int(roll["ConflictVoteCount"])) vote = VoteEvent( chamber=vote_chamber, start_date=vote_date, motion_text=vote_motion, result=vote_passed, bill=bill, legislative_session=session, classification=[], ) vote_pdf_url = ("https://legis.delaware.gov" "/json/RollCallController/GenerateRollCallPdf" "?rollCallId={}&chamberId={}".format( vote_id, self.chamber_codes[vote_chamber])) # Vote URL is just a generic search URL with POSTed data, # so provide a different link vote.add_source(vote_pdf_url) vote.dedupe_key = vote_pdf_url vote.set_count("yes", roll["YesVoteCount"]) vote.set_count("no", roll["NoVoteCount"]) vote.set_count("other", other_count) for row in roll["AssemblyMemberVotes"]: # AssemblyMemberId looks like it should work here, # but for some sessions it's bugged to only return session try: voter = self.legislators_by_short[str(row["ShortName"])] name = voter["DisplayName"] except KeyError: self.warning("could not find legislator short name %s", row["ShortName"]) name = row["ShortName"] if row["SelectVoteTypeCode"] == "Y": vote.yes(name) elif row["SelectVoteTypeCode"] == "N": vote.no(name) else: vote.vote("other", name) yield vote
def parse_roll_call(self, bill, link, chamber, date): url = link.attrib["href"] page = self.get(url).text page = lxml.html.fromstring(page) xpath = 'string(//div[@class="Column-OneFourth"]/div[3])' motion = page.xpath(xpath).strip() motion = re.sub(r"\s+", " ", motion) if motion == "FP": motion = "FINAL PASSAGE" if motion == "FINAL PASSAGE": type = "passage" elif re.match(r"CONCUR(RENCE)? IN \w+ AMENDMENTS", motion): type = "amendment" else: type = [] motion = link.text_content() # Looks like for "YEAS" and "NAYS" counts, PA has multiple HTML # formats: one where the "YEAS" text node is nested within a span # element, and another where the text node is a direct child of the div # element yeas_elements = page.xpath("//div/span[text() = 'YEAS']/..") if len(yeas_elements) == 0: yeas_elements = page.xpath("//div[text()[normalize-space() = 'YEAS']]") yeas = int(yeas_elements[0].getnext().text) nays_elements = page.xpath("//div/span[text() = 'NAYS']/..") if len(nays_elements) == 0: nays_elements = page.xpath("//div[text()[normalize-space() = 'NAYS']]") nays = int(nays_elements[0].getnext().text) # "LVE" and "N/V" have been moved up as direct children of the div # element other = 0 lve_elements = page.xpath('//div[text()[normalize-space() = "LVE"]]') if lve_elements: other += int(lve_elements[0].getnext().text) nv_elements = page.xpath('//div[text()[normalize-space() = "N/V"]]') if nv_elements: other += int(nv_elements[0].getnext().text) vote = VoteEvent( chamber=chamber, start_date=tz.localize(date), motion_text=motion, classification=type, result="pass" if yeas > (nays + other) else "fail", bill=bill, ) # dedupe_key situation here is a bit weird, same vote can be used for # multiple bills see: # http://www.legis.state.pa.us/CFDOCS/Legis/RC/Public/rc_view_action2.cfm?sess_yr=2017&sess_ind=0&rc_body=H&rc_nbr=11 # noqa # so we toss the bill id onto the end of the URL vote.dedupe_key = url + "#" + bill.identifier vote.add_source(url) vote.set_count("yes", yeas) vote.set_count("no", nays) vote.set_count("other", other) for div in page.xpath('//*[contains(@class, "RollCalls-Vote")]'): name = div[0].tail.strip() name = re.sub(r"^[\s,]+", "", name) name = re.sub(r"[\s,]+$", "", name) class_attr = div.attrib["class"].lower() if "yea" in class_attr: voteval = "yes" elif "nay" in class_attr: voteval = "no" elif "nvote" in class_attr: voteval = "other" elif "lve" in class_attr: voteval = "other" else: msg = "Unrecognized vote val: %s" % class_attr raise Exception(msg) vote.vote(voteval, name) return vote
def scrape_votes(self, bill, url): page = lxml.html.fromstring(self.get(url).text.replace(u"\xa0", " ")) seen_rcs = set() re_ns = "http://exslt.org/regular-expressions" path = r"//p[re:test(text(), 'OKLAHOMA\s+(HOUSE|STATE\s+SENATE)')]" for header in page.xpath(path, namespaces={"re": re_ns}): bad_vote = False # Each chamber has the motion name on a different line of the file if "HOUSE" in header.xpath("string()"): chamber = "lower" motion_index = 8 else: chamber = "upper" motion_index = 13 motion = header.xpath("string(following-sibling::p[%d])" % motion_index).strip() motion = re.sub(r"\s+", " ", motion) if not motion.strip(): self.warning("Motion text not found") return match = re.match(r"^(.*) (PASSED|FAILED)$", motion) if match: motion = match.group(1) passed = match.group(2) == "PASSED" else: passed = None rcs_p = header.xpath( "following-sibling::p[contains(., 'RCS#')]")[0] rcs_line = rcs_p.xpath("string()").replace(u"\xa0", " ") rcs = re.search(r"RCS#\s+(\d+)", rcs_line).group(1) if rcs in seen_rcs: continue else: seen_rcs.add(rcs) date_line = rcs_p.getnext().xpath("string()") date = re.search(r"\d+/\d+/\d+", date_line).group(0) date = datetime.datetime.strptime(date, "%m/%d/%Y").date() vtype = None counts = collections.defaultdict(int) votes = collections.defaultdict(list) seen_yes = False for sib in header.xpath("following-sibling::p")[13:]: line = sib.xpath("string()").replace("\r\n", " ").strip() if "*****" in line: break regex = (r"(YEAS|NAYS|EXCUSED|VACANT|CONSTITUTIONAL " r"PRIVILEGE|NOT VOTING|N/V)\s*:\s*(\d+)(.*)") match = re.match(regex, line) if match: if match.group(1) == "YEAS" and "RCS#" not in line: vtype = "yes" seen_yes = True elif match.group(1) == "NAYS" and seen_yes: vtype = "no" elif match.group(1) == "VACANT": continue # skip these elif seen_yes: vtype = "other" if seen_yes and match.group(3).strip(): self.warning("Bad vote format, skipping.") bad_vote = True counts[vtype] += int(match.group(2)) elif seen_yes: for name in line.split(" "): if not name: continue if "HOUSE" in name or "SENATE " in name: continue votes[vtype].append(name.strip()) if bad_vote: continue if passed is None: passed = counts["yes"] > (counts["no"] + counts["other"]) vote = Vote( chamber=chamber, start_date=date.strftime("%Y-%m-%d"), motion_text=motion, result="pass" if passed else "fail", bill=bill, classification="passage", ) vote.set_count("yes", counts["yes"]) vote.set_count("no", counts["no"]) vote.set_count("other", counts["other"]) vote.dedupe_key = url + "#" + rcs vote.add_source(url) for name in votes["yes"]: vote.yes(name) for name in votes["no"]: if ":" in name: raise Exception(name) vote.no(name) for name in votes["other"]: vote.vote("other", name) yield vote
def scrape_vote(self, url, session): fname, _ = self.urlretrieve(url) text = convert_pdf(fname, type="text").decode() lines = text.splitlines() chamber = "upper" if "senate" in url else "lower" if "Maryland" not in text: self.warning(f"empty vote from {url}") return date = re.findall(r"Legislative Date: (\w+ \d+, \d{4})", text)[0] section = "preamble" motion = None bill_id = None how = None voters = defaultdict(list) for line in lines: if section == "preamble": if "vetoed" in line.lower(): self.warning( f"skipping vote that appears to be on prior session: {line}, {bill_id}" ) return possible_bill_id = re.findall(r"([HS][BJR] \d+)", line) if possible_bill_id: bill_id = possible_bill_id[0] # preamble has metadata, then motion, then counts. our process then is to # store the last line as the motion, but if the last line looks like a # continuation, append it to the prior line line = line.strip() counts = re.findall( r"(\d+) Yeas\s+(\d+) Nays\s+(\d+) Not Voting\s+(\d+) Excused\s+(\d+) Absent", line, ) if counts: yes_count, no_count, nv_count, excused_count, absent_count = counts[ 0] yes_count = int(yes_count) no_count = int(no_count) nv_count = int(nv_count) excused_count = int(excused_count) absent_count = int(absent_count) section = "votes" elif line and line != "(Const)": # questions seem to be split across two lines if line.endswith("?"): motion = motion + " " + line else: motion = line elif section == "votes": if line.startswith("Voting Yea"): how = "yes" elif line.startswith("Voting Nay"): how = "no" elif line.startswith("Not Voting"): how = "not voting" elif line.startswith("Excused from Voting"): how = "excused" elif line.startswith("Excused (Absent)"): how = "absent" elif how: names = re.split(r"\s{2,}", line) voters[how].extend(names) if not bill_id and not motion: return elif bill_id and not motion: self.warning( f"got {bill_id} but no motion, not registering as a vote") elif motion and not bill_id: self.warning( f"got {motion} but no bill_id, not registering as a vote") return # bleh - result not indicated anywhere result = "pass" if yes_count > no_count else "fail" bill_chamber = "upper" if bill_id.startswith("S") else "lower" date = datetime.datetime.strptime(date, "%b %d, %Y").strftime("%Y-%m-%d") vote = VoteEvent( chamber=chamber, start_date=date, result=result, classification="passage", motion_text=motion, legislative_session=session, bill=bill_id, bill_chamber=bill_chamber, ) # URL includes sequence ID, will be unique vote.dedupe_key = url vote.add_source(url) vote.set_count("yes", yes_count) vote.set_count("no", no_count) vote.set_count("not voting", nv_count) vote.set_count("excused", excused_count) vote.set_count("absent", absent_count) for how, names in voters.items(): for name in names: name = name.strip().replace("*", "") if name and "COPY" not in name and "Indicates Vote Change" not in name: vote.vote(how, name) check_counts(vote, raise_error=True) return vote
def parse_vote(self, bill, link): # Server sometimes sends proper error headers, # sometimes not try: self.info("Get {}".format(link)) text = requests.get(link).text except requests.exceptions.HTTPError as err: self.warning("{} fetching vote {}, skipping".format(err, link)) return if "Varnish cache server" in text: self.warning("Scrape rate is too high, try re-scraping with " "The --rpm set to a lower number") return if "Page Not Found" in text or "Page Unavailable" in text: self.warning("missing vote, skipping") return member_doc = lxml.html.fromstring(text) motion = member_doc.xpath("//div[@id='main_content']/h4/text()") chamber_date_line = "".join( member_doc.xpath("//div[@id='main_content']/h3[1]//text()")) chamber_date_line_words = chamber_date_line.split() vote_chamber = chamber_date_line_words[0] vote_date = datetime.datetime.strptime(chamber_date_line_words[-1], "%m/%d/%Y") vote_status = " ".join(chamber_date_line_words[2:-2]) opinions = member_doc.xpath( "//div[@id='main_content']/h3[position() > 1]/text()") if len(opinions) > 0: vote_status = vote_status if vote_status.strip() else motion[0] vote_chamber = "upper" if vote_chamber == "Senate" else "lower" for i in opinions: try: count = int(i[i.find("(") + 1:i.find(")")]) except ValueError: # This is likely not a vote-count text chunk # It's probably '`On roll call the vote was:` pass else: if "yea" in i.lower(): yes_count = count elif "nay" in i.lower(): no_count = count elif "present" in i.lower(): p_count = count elif "absent" in i.lower(): a_count = count vote = VoteEvent( bill=bill, start_date=vote_date.strftime("%Y-%m-%d"), chamber=vote_chamber, motion_text=vote_status, result="pass" if yes_count > no_count else "fail", classification="passage", ) vote.dedupe_key = link vote.set_count("yes", yes_count) vote.set_count("no", no_count) vote.set_count("abstain", p_count) vote.set_count("absent", a_count) vote.add_source(link) a_links = member_doc.xpath("//div[@id='main_content']/a/text()") for i in range(1, len(a_links)): if i <= yes_count: vote.vote("yes", re.sub(",", "", a_links[i]).split()[0]) elif no_count != 0 and i > yes_count and i <= yes_count + no_count: vote.vote("no", re.sub(",", "", a_links[i]).split()[0]) else: vote.vote("other", re.sub(",", "", a_links[i]).split()[0]) yield vote else: self.warning("No Votes for: %s", link)
def scrape_votes(self, session): votes = {} other_counts = defaultdict(int) last_line = [] vote_url = f"http://www.gencourt.state.nh.us/dynamicdatadump/RollCallSummary.txt?x={self.cachebreaker}" lines = self.get(vote_url).content.decode("utf-8").splitlines() for line in lines: if len(line) < 2: continue if line.strip() == "": continue line = line.split("|") if len(line) < 14: if len(last_line + line[1:]) == 14: line = last_line self.warning("used bad vote line") else: last_line = line self.warning("bad vote line %s" % "|".join(line)) session_yr = line[0].replace("\xef\xbb\xbf", "") body = line[1] vote_num = line[2] timestamp = line[3] bill_id = line[4].strip() yeas = int(line[5]) nays = int(line[6]) # present = int(line[7]) # absent = int(line[8]) motion = line[11].strip() or "[not available]" if session_yr == session and bill_id in self.bills_by_id: actor = "lower" if body == "H" else "upper" time = dt.datetime.strptime(timestamp, "%m/%d/%Y %I:%M:%S %p") time = pytz.timezone("America/New_York").localize( time).isoformat() # TODO: stop faking passed somehow passed = yeas > nays vote = Vote( chamber=actor, start_date=time, motion_text=motion, result="pass" if passed else "fail", classification="passage", bill=self.bills_by_id[bill_id], ) vote.set_count("yes", yeas) vote.set_count("no", nays) vote.add_source(vote_url) vote.dedupe_key = session_yr + body + vote_num # unique ID for vote votes[body + vote_num] = vote for line in (self.get( f"http://www.gencourt.state.nh.us/dynamicdatadump/RollCallHistory.txt?x={self.cachebreaker}" ).content.decode("utf-8").splitlines()): if len(line) < 2: continue # 2016|H|2|330795||Yea| # 2012 | H | 2 | 330795 | 964 | HB309 | Yea | 1/4/2012 8:27:03 PM session_yr, body, v_num, _, employee, bill_id, vote, date = line.split( "|") if not bill_id: continue if session_yr == session and bill_id.strip() in self.bills_by_id: try: leg = " ".join(self.legislators[employee]["name"].split()) except KeyError: self.warning("Error, can't find person %s" % employee) continue vote = vote.strip() if body + v_num not in votes: self.warning("Skipping processing this vote:") self.warning("Bad ID: %s" % (body + v_num)) continue # code = self.legislators[employee]['seat'] if vote == "Yea": votes[body + v_num].yes(leg) elif vote == "Nay": votes[body + v_num].no(leg) else: votes[body + v_num].vote("other", leg) # hack-ish, but will keep the vote count sync'd other_counts[body + v_num] += 1 votes[body + v_num].set_count("other", other_counts[body + v_num]) for vote in votes.values(): yield vote
def parse_vote_pdf(self, vote_url, bill): filename, response = self.urlretrieve(vote_url) text = convert_pdf(filename, type="text").decode() lines = text.splitlines() if "Senate" in vote_url: chamber = "upper" else: chamber = "lower" date_string = lines[0].split("Calendar Date:")[1].strip() date = datetime.datetime.strptime(date_string, "%b %d, %Y %I:%M (%p)") page_index = None for index, line in enumerate(lines): if "Yeas" in line and "Nays" in line: page_index = index break vote_counts = 5 * [0] vote_types = ["yes", "no", "not voting", "excused", "absent"] if page_index: counts = re.split(r"\s{2,}", lines[page_index].strip()) for index, count in enumerate(counts): number, string = count.split(" ", 1) number = int(number) vote_counts[index] = number else: raise ValueError("Vote Counts Not found at %s" % vote_url) passed = vote_counts[0] > vote_counts[1] # Consent calendar votes address multiple bills in one VoteEvent # eg, http://mgaleg.maryland.gov/2018RS/votes/Senate/0478.pdf is_consent_calendar = any( ["Consent Calendar" in line for line in lines[:page_index]] ) consent_calendar_bills = None motion = "" if is_consent_calendar: motion = re.split(r"\s{2,}", lines[page_index - 4].strip())[0] consent_calendar_bills = re.split(r"\s{2,}", lines[page_index - 1].strip()) assert ( consent_calendar_bills ), "Could not find bills for consent calendar vote" motion_keywords = [ "favorable", "reading", "amendment", "motion", "introduced", "bill pass", "committee", ] motion_lines = [ 3, 2, 4, 5, ] # Relative LineNumbers to be checked for existence of motion for i in motion_lines: if any( motion_keyword in motion.lower() for motion_keyword in motion_keywords ): break motion = re.split(r"\s{2,}", lines[page_index - i].strip())[0] else: if not any( motion_keyword in motion.lower() for motion_keyword in motion_keywords ): # This condition covers for the bad formating in SB 1260 motion = lines[page_index - 3] if not any( motion_keyword in motion.lower() for motion_keyword in motion_keywords ): # Check this one for SB 747 motion = "No motion given" self.warning("No motion given") vote = VoteEvent( bill=bill, chamber=chamber, start_date=date.strftime("%Y-%m-%d"), motion_text=motion, classification="passage", result="pass" if passed else "fail", ) # Include bill ID to avoid duplication for consent calendars vote.dedupe_key = "{}#{}".format(vote_url, bill.identifier) for index, vote_type in enumerate(vote_types): vote.set_count(vote_type, vote_counts[index]) page_index = page_index + 2 # Keywords for identifying where names are located in the pdf show_stoppers = [ "Voting Nay", "Not Voting", "COPY", "Excused", "indicates vote change", "Indicates Vote Change", ] vote_index = 0 # For matching number of names extracted with vote counts(extracted independently) vote_name_counts = 5 * [0] while page_index < len(lines): current_line = lines[page_index].strip() if not current_line or "Voting Yea" in current_line: page_index += 1 continue if any(show_stopper in current_line for show_stopper in show_stoppers): page_index += 1 vote_index = vote_index + 1 continue names = re.split(r"\s{2,}", current_line) vote_name_counts[vote_index] += len(names) for name in names: vote.vote(vote_types[vote_index], name) page_index += 1 if vote_counts != vote_name_counts: raise ValueError("Votes Count and Number of Names don't match") return vote
def scrape_pdf_for_votes(self, session, actor, date, motion, href): warned = False # vote indicator, a few spaces, a name, newline or multiple spaces # VOTE_RE = re.compile('(Y|N|E|NV|A|P|-)\s{2,5}(\w.+?)(?:\n|\s{2})') COUNT_RE = re.compile( r"^(\d+)\s+YEAS?\s+(\d+)\s+NAYS?\s+(\d+)\s+PRESENT(?:\s+(\d+)\s+NOT\sVOTING)?\s*$" ) PASS_FAIL_WORDS = { "PASSED": "pass", "PREVAILED": "fail", "ADOPTED": "pass", "CONCURRED": "pass", "FAILED": "fail", "LOST": "fail", } pdflines = self.fetch_pdf_lines(href) if not pdflines: return False yes_count = no_count = present_count = 0 yes_votes = [] no_votes = [] present_votes = [] excused_votes = [] not_voting = [] absent_votes = [] passed = None counts_found = False vote_lines = [] for line in pdflines: # consider pass/fail as a document property instead of a result of the vote count # extract the vote count from the document instead of just using counts of names if not line.strip(): continue elif line.strip() in PASS_FAIL_WORDS: # Crash on duplicate pass/fail status that differs from previous status if passed is not None and passed != PASS_FAIL_WORDS[line.strip()]: raise Exception("Duplicate pass/fail matches in [%s]" % href) passed = PASS_FAIL_WORDS[line.strip()] elif COUNT_RE.match(line): (yes_count, no_count, present_count, not_voting_count) = COUNT_RE.match( line ).groups() yes_count = int(yes_count) no_count = int(no_count) present_count = int(present_count) counts_found = True elif counts_found: for value in VOTE_VALUES: if re.search(r"^\s*({})\s+\w".format(value), line): vote_lines.append(line) break votes = find_columns_and_parse(vote_lines) for name, vcode in votes.items(): if name == "Mr. Speaker": name = session_details[session]["speaker"] elif name == "Mr. President": name = session_details[session]["president"] else: # Converts "Davis,William" to "Davis, William". name = re.sub(r"\,([a-zA-Z])", r", \1", name) if vcode == "Y": yes_votes.append(name) elif vcode == "N": no_votes.append(name) elif vcode == "P": present_votes.append(name) elif vcode == "E": excused_votes.append(name) elif vcode == "NV": not_voting.append(name) elif vcode == "A": absent_votes.append(name) # fake the counts if yes_count == 0 and no_count == 0 and present_count == 0: yes_count = len(yes_votes) no_count = len(no_votes) else: # audit if yes_count != len(yes_votes): self.warning( "Mismatched yes count [expect: %i] [have: %i]" % (yes_count, len(yes_votes)) ) warned = True if no_count != len(no_votes): self.warning( "Mismatched no count [expect: %i] [have: %i]" % (no_count, len(no_votes)) ) warned = True if passed is None: if actor["classification"] == "lower": # senate doesn't have these lines self.warning( "No pass/fail word found; fall back to comparing yes and no vote." 
) warned = True passed = "pass" if yes_count > no_count else "fail" classification, _ = _categorize_action(motion) vote_event = VoteEvent( legislative_session=session, motion_text=motion, classification=classification, organization=actor, start_date=date, result=passed, ) for name in yes_votes: vote_event.yes(name) for name in no_votes: vote_event.no(name) for name in present_votes: vote_event.vote("other", name) for name in excused_votes: vote_event.vote("excused", name) for name in not_voting: vote_event.vote("not voting", name) for name in absent_votes: vote_event.vote("absent", name) vote_event.set_count("yes", yes_count) vote_event.set_count("no", no_count) vote_event.set_count("other", present_count) vote_event.set_count("excused", len(excused_votes)) vote_event.set_count("absent", len(absent_votes)) vote_event.set_count("not voting", len(not_voting)) vote_event.add_source(href) # for distinguishing between votes with the same id and on same day vote_event.dedupe_key = href if warned: self.warning("Warnings were issued. Best to check %s" % href) return vote_event
def scrape_vote(self, session, bill, vote_url, chamber, date): page = self.lxmlize(vote_url) try: motion = page.xpath("//font/text()")[2] except IndexError: self.warning("Vote Summary Page Broken ") return # eg. http://leg.colorado.gov/content/sb18-033vote563ce6 if ("AM" in motion or "PM" in motion) and "/" in motion: motion = "Motion not given." if "withdrawn" not in motion: yes_no_counts = page.xpath( "//tr/td[preceding-sibling::td/descendant::" "font[contains(text(),'Aye')]]/font/text()") other_counts = page.xpath( "//tr/td[preceding-sibling::td/descendant::" "font[contains(text(),'Absent')]]/font/text()") abstain_counts = page.xpath( "//tr/td[preceding-sibling::td/descendant::" "font[contains(text(),'17C')]]/font/text()") if not yes_no_counts: self.info("Missing yes no count") return yes_count = int(yes_no_counts[0]) no_count = int(yes_no_counts[2]) exc_count = int(other_counts[2]) absent_count = int(other_counts[0]) abstain_count = 0 if abstain_counts: abstain_count = int(abstain_counts[0]) # fix for # http://leg.colorado.gov/content/hb19-1029vote65e72e if absent_count == -1: absent_count = 0 passed = yes_count > no_count vote = VoteEvent( chamber=chamber, start_date=self._tz.localize(date), motion_text=motion, result="pass" if passed else "fail", bill=bill, classification="passage", ) vote.dedupe_key = vote_url vote.set_count("yes", yes_count) vote.set_count("no", no_count) vote.set_count("excused", exc_count) vote.set_count("absent", absent_count) vote.set_count("abstain", abstain_count) vote.add_source(vote_url) rolls = page.xpath("//tr[preceding-sibling::tr/descendant::" "td/div/b/font[contains(text(),'Vote')]]") vote_abrv = { "Y": "yes", "N": "no", "E": "excused", "A": "absent", "-": "absent", "17C": "abstain", } for roll in rolls: if len(roll.xpath(".//td/div/font/text()")) > 0: voted = roll.xpath(".//td/div/font/text()")[0].strip() voter = roll.xpath(".//td/font/text()")[0].strip() if voted == "V": continue vote.vote(vote_abrv[voted], voter) yield vote
def scrape_bills(self, session, year_abr): # Main Bill information main_bill_csv = self.to_csv("MAINBILL.TXT") # keep a dictionary of bills (mapping bill_id to Bill obj) bill_dict = {} for rec in main_bill_csv: bill_type = rec["BillType"].strip() bill_number = int(rec["BillNumber"]) bill_id = bill_type + str(bill_number) title = rec["Synopsis"] if bill_type[0] == "A": chamber = "lower" else: chamber = "upper" # some bills have a blank title.. just skip it if not title: continue bill = Bill( bill_id, title=title, chamber=chamber, legislative_session=session, classification=self._bill_types[bill_type[1:]], ) if rec["IdenticalBillNumber"].strip(): bill.add_related_bill( rec["IdenticalBillNumber"].split()[0], legislative_session=session, relation_type="companion", ) # TODO: last session info is in there too bill_dict[bill_id] = bill # Sponsors bill_sponsors_csv = self.to_csv("BILLSPON.TXT") for rec in bill_sponsors_csv: bill_type = rec["BillType"].strip() bill_number = int(rec["BillNumber"]) bill_id = bill_type + str(bill_number) if bill_id not in bill_dict: self.warning("unknown bill %s in sponsor database" % bill_id) continue bill = bill_dict[bill_id] name = rec["Sponsor"] sponsor_type = rec["Type"] if sponsor_type == "P": sponsor_type = "primary" else: sponsor_type = "cosponsor" bill.add_sponsorship( name, classification=sponsor_type, entity_type="person", primary=sponsor_type == "primary", ) # Documents bill_document_csv = self.to_csv("BILLWP.TXT") for rec in bill_document_csv: bill_type = rec["BillType"].strip() bill_number = int(rec["BillNumber"]) bill_id = bill_type + str(bill_number) if bill_id not in bill_dict: self.warning("unknown bill %s in document database" % bill_id) continue bill = bill_dict[bill_id] document = rec["Document"] document = document.split("\\") document = document[-2] + "/" + document[-1] htm_url = "https://www.njleg.state.nj.us/Bills/{}/{}".format( year_abr, document.replace(".DOC", ".HTM")) pdf_url = "https://www.njleg.state.nj.us/Bills/{}/{}".format( year_abr, document.replace(".DOC", ".PDF")) # name document based _doctype try: doc_name = self._doctypes[rec["DocType"]] except KeyError: raise Exception("unknown doctype %s on %s" % (rec["DocType"], bill_id)) if rec["Comment"]: doc_name += " " + rec["Comment"] # Clean links. 
if htm_url.endswith("HTMX"): htm_url = re.sub("X$", "", htm_url) if pdf_url.endswith("PDFX"): pdf_url = re.sub("X$", "", pdf_url) if rec["DocType"] in self._version_types: if htm_url.lower().endswith("htm"): mimetype = "text/html" elif htm_url.lower().endswith("wpd"): mimetype = "application/vnd.wordperfect" try: bill.add_version_link(doc_name, htm_url, media_type=mimetype) bill.add_version_link(doc_name, pdf_url, media_type="application/pdf") except ValueError: self.warning( "Couldn't find a document for bill {}".format(bill_id)) pass else: bill.add_document_link(doc_name, htm_url) # Votes next_year = int(year_abr) + 1 vote_info_list = [ "A%s" % year_abr, "A%s" % next_year, "S%s" % year_abr, "S%s" % next_year, "CA%s-%s" % (year_abr, next_year), "CS%s-%s" % (year_abr, next_year), ] # keep votes clean globally, a few votes show up in multiple files votes = {} for filename in vote_info_list: s_vote_url = f"https://www.njleg.state.nj.us/votes/{filename}.zip" try: s_vote_zip, resp = self.urlretrieve(s_vote_url) except scrapelib.HTTPError: self.warning("could not find %s" % s_vote_url) continue zippedfile = zipfile.ZipFile(s_vote_zip) for vfile in ["%s.txt" % (filename), "%sEnd.txt" % (filename)]: try: vote_file = io.TextIOWrapper(zippedfile.open(vfile, "r"), encoding="latin-1") except KeyError: # # Right, so, 2011 we have an "End" file with more # vote data than was in the original dump. # self.warning("No such file: %s" % (vfile)) continue vdict_file = csv.DictReader(vote_file) if filename.startswith("A") or filename.startswith("CA"): chamber = "lower" else: chamber = "upper" if filename.startswith("C"): vote_file_type = "committee" else: vote_file_type = "chamber" for rec in vdict_file: if vote_file_type == "chamber": bill_id = rec["Bill"].strip() leg = rec["Full_Name"] date = rec["Session_Date"] action = rec["Action"] leg_vote = rec["Legislator_Vote"] vote_parts = (bill_id, chamber, action) else: bill_id = "%s%s" % (rec["Bill_Type"], rec["Bill_Number"]) leg = rec["Name"] # drop time portion date = rec["Agenda_Date"].split()[0] # make motion readable action = self._com_vote_motions[rec["BillAction"]] # first char (Y/N) use [0:1] to ignore '' leg_vote = rec["LegislatorVote"][0:1] committee = rec["Committee_House"] vote_parts = (bill_id, chamber, action, committee) date = datetime.strptime(date, "%m/%d/%Y") vote_id = "_".join(vote_parts).replace(" ", "_") if vote_id not in votes: votes[vote_id] = VoteEvent( start_date=TIMEZONE.localize(date), chamber=chamber, motion_text=action, classification="passage", result=None, bill=bill_dict[bill_id], ) votes[vote_id].dedupe_key = vote_id if leg_vote == "Y": votes[vote_id].vote("yes", leg) elif leg_vote == "N": votes[vote_id].vote("no", leg) else: votes[vote_id].vote("other", leg) # remove temp file os.remove(s_vote_zip) # Counts yes/no/other votes and saves overall vote for vote in votes.values(): counts = collections.defaultdict(int) for count in vote.votes: counts[count["option"]] += 1 vote.set_count("yes", counts["yes"]) vote.set_count("no", counts["no"]) vote.set_count("other", counts["other"]) # Veto override. if vote.motion_text == "OVERRIDE": # Per the NJ leg's glossary, a veto override requires # 2/3ds of each chamber. 27 in the senate, 54 in the house. # http://www.njleg.state.nj.us/legislativepub/glossary.asp if "lower" in vote.bill: vote.result = "pass" if counts["yes"] >= 54 else "fail" elif "upper" in vote.bill: vote.result = "pass" if counts["yes"] >= 27 else "fail" else: # Regular vote. 
vote.result = "pass" if counts["yes"] > counts[ "no"] else "fail" vote.add_source("http://www.njleg.state.nj.us/downloads.asp") yield vote # Actions bill_action_csv = self.to_csv("BILLHIST.TXT") actor_map = {"A": "lower", "G": "executive", "S": "upper"} for rec in bill_action_csv: bill_type = rec["BillType"].strip() bill_number = int(rec["BillNumber"]) bill_id = bill_type + str(bill_number) if bill_id not in bill_dict: self.warning("unknown bill %s in action database" % bill_id) continue bill = bill_dict[bill_id] action = rec["Action"] date = rec["DateAction"] date = dateutil.parser.parse(date) actor = actor_map[rec["House"]] comment = rec["Comment"] action, atype = self.categorize_action(action, bill_id) if comment: action += " " + comment bill.add_action( action, date=TIMEZONE.localize(date), classification=atype, chamber=actor, ) # Subjects subject_csv = self.to_csv("BILLSUBJ.TXT") for rec in subject_csv: bill_id = rec["BillType"].strip() + str(int(rec["BillNumber"])) if bill_id not in bill_dict: self.warning("unknown bill %s in subject database" % bill_id) continue bill = bill_dict.get(bill_id) if bill: bill.subject.append(rec["SubjectKey"]) else: self.warning("invalid bill id in BillSubj: %s" % bill_id) phony_bill_count = 0 # save all bills at the end for bill in bill_dict.values(): # add sources if not bill.actions and not bill.versions: self.warning("probable phony bill detected %s", bill.identifier) phony_bill_count += 1 else: bill.add_source("http://www.njleg.state.nj.us/downloads.asp") yield bill if phony_bill_count: self.warning("%s total phony bills detected", phony_bill_count)
def process_vote(self, votes, url, base_url, bill, legislators, chamber_dict, vote_results): for v in votes["items"]: try: v["yeas"] except KeyError: # sometimes the actual vote is buried a second layer deep v = self.get(base_url + v["link"]).json() try: v["yeas"] except KeyError: self.logger.warning("No vote info available, skipping") continue try: chamber = chamber_dict[v["chamber"]] except KeyError: chamber = "lower" if "house" in v["apn"] else "upper" try: date = self._tz.localize( datetime.datetime.strptime(v["date"], "%m/%d/%y")) date = "{:%Y-%m-%d}".format(date) except KeyError: try: date = self._tz.localize( datetime.datetime.strptime(v["occurred"], "%m/%d/%y")) date = "{:%Y-%m-%d}".format(date) except KeyError: self.logger.warning("No date found for vote, skipping") continue try: motion = v["action"] except KeyError: motion = v["motiontype"] if motion in self._vote_motion_dict: motion_text = self._vote_motion_dict[motion] else: self.warning( "Unknown vote code {}, please add to _vote_motion_dict". format(motion)) motion_text = v["results"] # Sometimes Ohio's SOLAR will only return part of the JSON, so in that case skip if not motion and isinstance(v["yeas"], str) and isinstance( v["nays"], str): waringText = 'Malformed JSON found for vote ("revno" of {}); skipping' self.warning(waringText.format(v["revno"])) continue result = v.get("results") or v.get("passed") if result is None: if len(v["yeas"]) > len(v["nays"]): result = "passed" else: result = "failed" passed = vote_results[result.lower()] if "committee" in v: vote = VoteEvent( chamber=chamber, start_date=date, motion_text=motion_text, result="pass" if passed else "fail", # organization=v["committee"], bill=bill, classification="committee-passage", ) else: vote = VoteEvent( chamber=chamber, start_date=date, motion_text=motion_text, result="pass" if passed else "fail", classification="passage", bill=bill, ) # Concatenate the bill identifier and vote identifier to avoid collisions vote.dedupe_key = "{}:{}".format(bill.identifier.replace(" ", ""), v["revno"]) # the yea and nay counts are not displayed, but vote totals are # and passage status is. yes_count = 0 no_count = 0 absent_count = 0 excused_count = 0 for voter_id in v["yeas"]: vote.yes(legislators[voter_id]) yes_count += 1 for voter_id in v["nays"]: vote.no(legislators[voter_id]) no_count += 1 if "absent" in v: for voter_id in v["absent"]: vote.vote("absent", legislators[voter_id]) absent_count += 1 if "excused" in v: for voter_id in v["excused"]: vote.vote("excused", legislators[voter_id]) excused_count += 1 vote.set_count("yes", yes_count) vote.set_count("no", no_count) vote.set_count("absent", absent_count) vote.set_count("excused", excused_count) # check to see if there are any other things that look # like vote categories, throw a warning if so for key, val in v.items(): if (type(val) == list and len(val) > 0 and key not in ["yeas", "nays", "absent", "excused"]): if val[0] in legislators: self.logger.warning( "{k} looks like a vote type that's not being counted." " Double check it?".format(k=key)) vote.add_source(url) yield vote
    def scrape_house_vote(self, bill, url):
        try:
            filename, resp = self.urlretrieve(url, timeout=80)
        except scrapelib.HTTPError:
            self.warning("missing vote file %s" % url)
            return
        text = convert_pdf(filename, "text")
        os.remove(filename)

        lines = text.splitlines()

        vote_type = None
        votes = collections.defaultdict(list)
        date = None

        for idx, line in enumerate(lines):
            line = line.rstrip().decode("utf-8")

            match = re.search(r"(\d+)/(\d+)/(\d{4})$", line)
            if match:
                date = datetime.datetime.strptime(match.group(0), "%m/%d/%Y")
                continue

            match = re.match(r"\s+YEAS: (\d+)\s+NAYS: (\d+)\s+NOT VOTING: (\d+)", line)
            if match:
                motion = lines[idx - 2].strip().decode("utf-8")
                if not motion:
                    self.warning("No motion text found for vote")
                    motion = "PASSAGE"
                yes_count, no_count, other_count = [int(g) for g in match.groups()]

                exc_match = re.search(r"EXCUSED: (\d+)", line)
                if exc_match:
                    other_count += int(exc_match.group(1))

                if line.endswith("ADOPTED") or line.endswith("PASSED"):
                    passed = True
                else:
                    passed = False
                continue

            match = re.match(r"(YEAS|NAYS|NOT VOTING|PAIRED|EXCUSED):\s+(\d+)\s*$", line)
            if match:
                vote_type = {
                    "YEAS": "yes",
                    "NAYS": "no",
                    "NOT VOTING": "other",
                    "EXCUSED": "other",
                    "PAIRED": "paired",
                }[match.group(1)]
                continue

            if vote_type == "paired":
                for part in line.split(" "):
                    part = part.strip()
                    if not part:
                        continue
                    # match against the individual part, not the whole line
                    name, pair_type = re.match(
                        r"([^\(]+)\((YEA|NAY)\)", part
                    ).groups()
                    name = name.strip()
                    if pair_type == "YEA":
                        votes["yes"].append(name)
                    elif pair_type == "NAY":
                        votes["no"].append(name)
            elif vote_type:
                for name in line.split(" "):
                    name = name.strip()
                    if not name:
                        continue
                    votes[vote_type].append(name)

        if date:
            vote = VoteEvent(
                chamber="lower",
                start_date=date.strftime("%Y-%m-%d"),
                motion_text=motion,
                result="pass" if passed else "fail",
                classification="passage",
                bill=bill,
            )
            vote.set_count("yes", yes_count)
            vote.set_count("no", no_count)
            vote.set_count("other", other_count)
            vote.add_source(url)
            vote.dedupe_key = url

            for key, values in votes.items():
                for value in values:
                    if "Committee" in value:
                        continue
                    if "*" in value:
                        value = value.replace("*", "")
                    vote.vote(key, value)

            yield vote
        else:
            self.warning("Syntax Error/Warning using 'convert_pdf'")
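
# --- Illustration (not part of the scraper above) ---
# A quick check of how the totals regex in scrape_house_vote is expected to
# behave. The sample line is hypothetical, not taken from a real journal PDF.
import re

sample = "     YEAS: 63     NAYS: 30     NOT VOTING: 5     EXCUSED: 2     PASSED"
m = re.match(r"\s+YEAS: (\d+)\s+NAYS: (\d+)\s+NOT VOTING: (\d+)", sample)
assert [int(g) for g in m.groups()] == [63, 30, 5]
assert re.search(r"EXCUSED: (\d+)", sample).group(1) == "2"
assert sample.endswith("PASSED")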
    def scrape_vote(self, bill, date, url):
        page = self.get(url).json()

        location = page["actionLog"]["FullName"]
        if location:
            if "House" in location:
                chamber = "lower"
            elif "Senate" in location:
                chamber = "upper"
            elif "Joint" in location:
                chamber = "legislature"
            else:
                self.warning("Bad Vote chamber: '%s', skipping" % location)
                return
        else:
            self.warning("Bad Vote chamber: '%s', skipping" % location)
            return

        motion = page["actionLog"]["StatusText"]
        # If we can't detect a motion, skip this vote
        if motion:
            yes_count = page["Yeas"]
            no_count = page["Nays"]
            excused_count = page["Excused"]
            absent_count = page["Absent"]

            passed = yes_count > no_count

            if motion.startswith("Do Pass"):
                vtype = "passage"
            elif motion == "Concurred in amendments":
                vtype = "amendment"
            # commenting out until we add these back to OS-core
            # elif motion == "Veto override":
            #     vtype = "veto-override"
            else:
                vtype = []

            vote = VoteEvent(
                chamber=chamber,
                start_date=date,
                motion_text=motion,
                result="pass" if passed else "fail",
                classification=vtype,
                bill=bill,
            )
            # differentiate nearly identical votes
            vote.dedupe_key = url

            vote.add_source(url)
            vote.set_count("yes", yes_count)
            vote.set_count("no", no_count)
            vote.set_count("excused", excused_count)
            vote.set_count("absent", absent_count)

            for person in page["RollCalls"]:
                option = person["Vote1"]
                if option in ("Aye", "Yea"):
                    vote.yes(person["UniqueName"])
                elif option == "Nay":
                    vote.no(person["UniqueName"])
                elif option == "Excused":
                    vote.vote("excused", person["UniqueName"])
                elif option == "Absent":
                    vote.vote("absent", person["UniqueName"])

            yield vote
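
# --- Illustration (not part of the scraper above) ---
# The JSON shape scrape_vote expects, reconstructed from the fields it reads.
# Field names come from the code above; the values are invented examples.
sample_vote_page = {
    "actionLog": {"FullName": "House of Representatives", "StatusText": "Do Pass"},
    "Yeas": 40,
    "Nays": 25,
    "Excused": 3,
    "Absent": 2,
    "RollCalls": [
        {"UniqueName": "DOE", "Vote1": "Yea"},      # counted as a yes vote
        {"UniqueName": "ROE", "Vote1": "Excused"},  # recorded as excused
    ],
}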
    def _parse_votes(self, url, vote, bill):
        """Given a vote url and a vote object, extract the voters and
        the vote counts from the vote page and update the vote object.
        """
        if url.lower().endswith(".pdf"):
            try:
                resp = self.get(url)
            except HTTPError:
                # This vote document wasn't found.
                msg = "No document found at url %r" % url
                self.logger.warning(msg)
                return

            try:
                v = PDFCommitteeVote(url, resp.content, bill)
                return v.asvote()
            except PDFCommitteeVoteParseError:
                # Warn and skip.
                self.warning("Couldn't parse committee vote at %r" % url)
                return

        html = self.get(url).text
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)

        # Yes, no, excused, absent.
        try:
            vals = doc.xpath("//table")[1].xpath("tr/td/text()")
        except IndexError:
            # Most likely was a bogus link lacking vote data.
            return
        yes_count, no_count, excused_count, absent_count = map(int, vals)

        # Get the motion.
        try:
            motion = doc.xpath("//br")[-1].tail.strip()
        except IndexError:
            # Some of them mysteriously have no motion listed.
            motion = vote["action"]

        if not motion:
            motion = vote["action"]

        vote["motion"] = motion

        action = vote["action"]
        vote_url = vote["vote_url"]

        vote = VoteEvent(
            chamber=vote["chamber"],
            start_date=vote["date"],
            motion_text=vote["motion"],
            result="fail",  # placeholder
            classification="passage",
            bill=bill,
            bill_action=vote["action"],
        )
        vote.dedupe_key = vote_url  # URL contains sequence number
        vote.add_source(vote_url)
        vote.set_count("yes", yes_count)
        vote.set_count("no", no_count)
        vote.set_count("excused", excused_count)
        vote.set_count("absent", absent_count)

        for text in doc.xpath("//table")[2].xpath("tr/td/text()"):
            if not text.strip("\xa0"):
                continue
            v, name = filter(None, text.split("\xa0"))
            # A name in parentheses is treated as the short name.
            regex = re.compile(r".*?\((.*?)\)")
            short_name = re.findall(regex, name)
            if len(short_name) > 0:
                note = "Short Name: " + short_name[0]
            else:
                note = ""
            # Name without parentheses or brackets, like 'Kary, Douglas'
            name = re.sub(r"[\(\[].*?[\)\]]", "", name)
            if v == "Y":
                vote.yes(name, note=note)
            elif v == "N":
                vote.no(name, note=note)
            elif v == "E":
                vote.vote("excused", name, note=note)
            elif v == "A":
                vote.vote("absent", name, note=note)

        # code to determine value of `passed`
        passed = None

        # some actions take a super majority, so we aren't just
        # comparing the yeas and nays here.
        for i in vote_passage_indicators:
            if i in action:
                passed = True
                break

        for i in vote_failure_indicators:
            if i in action and passed:
                # a quick explanation: originally an exception was
                # thrown if both passage and failure indicators were
                # present because I thought that would be a bug in my
                # lists. Then I found 2007 HB 160.
                # Now passed = False if the nays outnumber the yeas.
                # I won't automatically mark it as passed if the yeas
                # outnumber the nays because I don't know what requires
                # a supermajority in MT.
                if no_count >= yes_count:
                    passed = False
                    break
                else:
                    raise Exception(
                        "passage and failure indicator both present at: %s" % url
                    )
            if i in action and passed is None:
                passed = False
                break

        for i in vote_ambiguous_indicators:
            if i in action:
                passed = yes_count > no_count
                break

        if passed is None:
            raise Exception("Unknown passage at: %s" % url)

        vote.result = "pass" if passed else "fail"
        return vote
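
# --- Illustration (not part of the scraper above) ---
# vote_passage_indicators, vote_failure_indicators, and
# vote_ambiguous_indicators are module-level lists defined elsewhere in this
# scraper. _parse_votes only relies on them being substrings to look for in
# the action text; the entries below are hypothetical stand-ins, not the real
# lists.
vote_passage_indicators_example = ["Adopted", "Do Pass as Amended"]
vote_failure_indicators_example = ["Died in Committee", "Do Not Pass"]
vote_ambiguous_indicators_example = ["Voice Vote", "On Motion"]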