def test_vote():
    """Check Vote construction, roll-call recording and count validation."""
    v = Vote('upper', datetime.datetime(2012, 1, 1), 'passage', True,
             3, 1, 2, note='note')

    # A freshly built vote carries the totals but no individual voters yet.
    expected = {
        'chamber': 'upper',
        'date': datetime.datetime(2012, 1, 1),
        'motion': 'passage',
        'passed': True,
        'yes_count': 3,
        'no_count': 1,
        'other_count': 2,
        'type': 'other',
        'yes_votes': [],
        'no_votes': [],
        'other_votes': [],
        'note': 'note',
        '_type': 'vote',
        'sources': [],
    }
    assert_equal(v, expected)

    yes_voters = ['Lincoln', 'Adams', 'Johnson']
    for voter in yes_voters:
        v.yes(voter)
    assert_equal(v['yes_votes'], yes_voters)

    no_voters = ['Kennedy']
    for voter in no_voters:
        v.no(voter)
    assert_equal(v['no_votes'], no_voters)

    other_voters = ['Polk', 'Pierce']
    for voter in other_voters:
        v.other(voter)
    assert_equal(v['other_votes'], other_voters)

    # The recorded voters now match the totals, so validation must succeed.
    v.validate()

    # One extra "yes" voter breaks the totals; validation has to reject it.
    v.yes('Clinton')
    with assert_raises(ValueError):
        v.validate()
def add_vote(self, bill, chamber, date, text, url):
    """Build a Vote from an action's text and attach it to ``bill``.

    The yes/no totals are read from the "Ayes N, Noes M" style phrase in
    ``text``.  The motion type is taken from a ``motion_classifiers``
    entry whose pattern matches the start of ``text`` ('other' when none
    does).  When ``url`` is given it is recorded as a source and handed to
    ``add_house_votes`` / ``add_senate_votes`` depending on whether it
    contains 'av' or 'sv'.

    Raises:
        IndexError: ``text`` contains no recognisable totals.
    """
    tallies = re.findall(r'Ayes,? (\d+)[,;]\s+N(?:oes|ays),? (\d+)', text)
    (yes, no) = int(tallies[0][0]), int(tallies[0][1])

    vtype = 'other'
    # ``items()`` works on Python 2 and 3 alike, and the loop variable no
    # longer shadows the ``type`` builtin.
    for regex, motion_type in motion_classifiers.items():
        if re.match(regex, text):
            vtype = motion_type
            break

    v = Vote(chamber, date, text, yes > no, yes, no, 0, type=vtype)

    # fetch the vote itself
    if url:
        v.add_source(url)

        if 'av' in url:
            self.add_house_votes(v, url)
        elif 'sv' in url:
            self.add_senate_votes(v, url)

    # other count is brute forced: the action text only states ayes and
    # noes, so whatever ended up in other_votes defines the total
    v['other_count'] = len(v['other_votes'])
    v.validate()
    bill.add_vote(v)
def add_vote(self, bill, chamber, date, text, url):
    """Build a Vote from an action's text and attach it to ``bill``.

    The yes/no totals are read from the "Ayes N, Noes M" style phrase in
    ``text``.  The motion type is taken from a ``motion_classifiers``
    entry whose pattern matches the start of ``text`` ('other' when none
    does).  When ``url`` is given it is recorded as a source and handed to
    ``add_house_votes`` / ``add_senate_votes`` depending on whether it
    contains 'av' or 'sv'.

    Raises:
        IndexError: ``text`` contains no recognisable totals.
    """
    tallies = re.findall(r'Ayes,? (\d+)[,;]\s+N(?:oes|ays),? (\d+)', text)
    (yes, no) = int(tallies[0][0]), int(tallies[0][1])

    vtype = 'other'
    # ``items()`` works on Python 2 and 3 alike, and the loop variable no
    # longer shadows the ``type`` builtin.
    for regex, motion_type in motion_classifiers.items():
        if re.match(regex, text):
            vtype = motion_type
            break

    v = Vote(chamber, date, text, yes > no, yes, no, 0, type=vtype)

    # fetch the vote itself
    if url:
        v.add_source(url)

        if 'av' in url:
            self.add_house_votes(v, url)
        elif 'sv' in url:
            self.add_senate_votes(v, url)

    # other count is brute forced: the action text only states ayes and
    # noes, so whatever ended up in other_votes defines the total
    v['other_count'] = len(v['other_votes'])
    v.validate()
    bill.add_vote(v)
def scrape_vote(self, bill, chamber, date, url):
    """Parse the vote PDF at ``url`` and attach the resulting Vote to ``bill``.

    The PDF is downloaded, converted to text and deleted again.  The motion
    is read from the fifth text line and the totals from the
    "Yeas - N" / "Nays - N" / "Not Voting - N" markers.  Individual votes
    are read from the tenth line onwards, until an "after roll call" or
    "Indication of Vote" line, or a "PAIR" entry, is reached.

    Returns silently (adding nothing) when the text has fewer than five
    lines or no "Yeas - N" marker.

    Raises:
        AttributeError: the "Nays" or "Not Voting" marker is missing.
    """
    (path, _resp) = self.urlretrieve(url)
    try:
        text = convert_pdf(path, 'text')
    finally:
        # Discard the downloaded PDF even when the conversion fails.
        os.remove(path)

    lines = text.split('\n')

    try:
        motion = lines[4].strip()
    except IndexError:
        return

    try:
        yes_count = int(re.search(r'Yeas - (\d+)', text).group(1))
    except AttributeError:
        return

    no_count = int(re.search(r'Nays - (\d+)', text).group(1))
    other_count = int(re.search(r'Not Voting - (\d+)', text).group(1))
    passed = yes_count > (no_count + other_count)

    vote = Vote(chamber, date, motion, passed, yes_count, no_count,
                other_count)
    vote.add_source(url)

    reached_pairs = False
    for line in lines[9:]:
        if 'after roll call' in line or 'Indication of Vote' in line:
            break
        if 'Presiding' in line:
            continue

        # Each column ends in "-<district number>"; split on that suffix.
        for col in re.split(r'-\d+', line):
            col = col.strip()
            if not col:
                continue

            match = re.match(r'(Y|N|EX|\*)\s+(.+)$', col)
            if not match:
                # No vote code in front of the text: counted as "other".
                vote.other(col)
                continue

            code, name = match.groups()
            if name == "PAIR":
                # Nothing after the PAIR entry is read.
                reached_pairs = True
                break

            if code == 'Y':
                vote.yes(name)
            elif code == 'N':
                vote.no(name)
            else:
                vote.other(name)

        if reached_pairs:
            break

    vote.validate()
    bill.add_vote(vote)
def scrape_vote(self, bill, action_text, url):
    """Build a Vote from an action line plus its detail page and add it to ``bill``.

    ``action_text`` looks like
    "Vote - Senate Floor - Third Reading Passed (46-0) - 01/16/12"; it
    supplies the chamber, motion, outcome and preliminary yes/no totals.
    The page at ``url`` supplies the date, the final totals and the
    per-member votes (read from its ``<nobr>`` text nodes).

    Raises:
        Exception: the chamber prefix or the motion outcome is not recognised.
    """
    doc = lxml.html.fromstring(self.urlopen(url))

    # process action_text - might look like
    # "Vote - Senate Floor - Third Reading Passed (46-0) - 01/16/12"
    if action_text.startswith('Vote - Senate Floor - '):
        action_text = action_text[22:]
        chamber = 'upper'
    elif action_text.startswith('Vote - House Floor - '):
        action_text = action_text[21:]
        chamber = 'lower'
    else:
        # Previously this fell through and crashed later on an unbound
        # ``chamber``; fail here with a message that names the input.
        raise Exception('unknown vote chamber: %s' % action_text)

    # Split off only the trailing date so a motion that itself contains
    # ' - ' no longer breaks the unpacking.
    motion, unused_date = action_text.rsplit(' - ', 1)
    yes_count, no_count = re.findall(r'\((\d+)-(\d+)\)', motion)[0]
    if 'Passed' in motion:
        motion = motion.split(' Passed')[0]
        passed = True
    elif 'Adopted' in motion:
        motion = motion.split(' Adopted')[0]
        passed = True
    elif 'Rejected' in motion:
        motion = motion.split(' Rejected')[0]
        passed = False
    elif 'Failed' in motion:
        motion = motion.split(' Failed')[0]
        passed = False
    elif 'Floor Amendment' in motion:
        passed = int(yes_count) > int(no_count)
    else:
        raise Exception('unknown motion: %s' % motion)

    vote = Vote(chamber=chamber, date=None, motion=motion,
                yes_count=int(yes_count), no_count=int(no_count),
                other_count=0, passed=passed)

    # ``vfunc`` is the recorder for the section currently being read; plain
    # text nodes following a section heading are passed to it.
    vfunc = None

    for text in doc.xpath('//nobr/text()'):
        text = text.replace(u'\xa0', ' ')
        if text.startswith('Calendar Date: '):
            stamp = text.split(': ', 1)[1]
            try:
                # %p is only honoured together with %I (12-hour clock);
                # with %H the AM/PM marker is silently ignored.
                vote['date'] = datetime.datetime.strptime(
                    stamp, '%b %d, %Y %I:%M %p')
            except ValueError:
                # Hours outside 01-12: keep the previous 24-hour parsing.
                vote['date'] = datetime.datetime.strptime(
                    stamp, '%b %d, %Y %H:%M %p')
        elif 'Yeas' in text and 'Nays' in text and 'Not Voting' in text:
            self.debug(text)
            yeas, nays, nv, exc, absent = re.match(
                r'(\d+) Yeas\s+(\d+) Nays\s+(\d+) Not Voting\s+(\d+) Excused \(Absent\)\s+(\d+) Absent',
                text).groups()
            vote['yes_count'] = int(yeas)
            vote['no_count'] = int(nays)
            vote['other_count'] = int(nv) + int(exc) + int(absent)
        elif 'Voting Yea' in text:
            vfunc = vote.yes
        elif 'Voting Nay' in text:
            vfunc = vote.no
        elif 'Not Voting' in text or 'Excused' in text:
            vfunc = vote.other
        elif vfunc:
            vfunc(text)

    vote.validate()
    vote.add_source(url)
    bill.add_vote(vote)
def scrape_votes(self, bill, link):
    """Scrape every vote listed on the page at ``link`` and add it to ``bill``.

    The text of the ``lblVoteData`` span is split into one chunk per vote;
    each chunk is split again on a run of ten non-breaking spaces into the
    motion followed by count lines ("Ayes...N") and name lists
    ("... voting aye were: A, B -").  The chamber is 'lower' when ``link``
    contains "ChamberVoting=H", otherwise 'upper'.

    Returns:
        The same ``bill`` object, with the votes added.
    """
    # Loop-invariant work is done once instead of once per vote.
    count_re = re.compile(r"\d+$")
    aye_re = re.compile(r"^.+voting aye were: (.+) -")
    no_re = re.compile(r"^.+voting no were: (.+) -")
    other_re = re.compile(r"^.+present and not voting were: (.+) -")
    chamber = "lower" if "ChamberVoting=H" in link else "upper"

    with self.urlopen(link) as page:
        page = lxml.html.fromstring(page)
        raw_vote_data = page.xpath(
            "//span[@id='lblVoteData']")[0].text_content()
        raw_vote_data = re.split(
            r"\w+? by [\w ]+?\s+-", raw_vote_data.strip())[1:]

        for raw_vote in raw_vote_data:
            raw_vote = raw_vote.split(
                u"\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0")
            motion = raw_vote[0]

            # Left as None when the motion text carries no date.
            vote_date = re.search(r"(\d+/\d+/\d+)", motion)
            if vote_date:
                vote_date = datetime.datetime.strptime(
                    vote_date.group(), "%m/%d/%Y")

            passed = ("Passed" in motion
                      or "Recommended for passage" in motion
                      or "Adopted" in raw_vote[1])

            yes_count = 0
            no_count = 0
            other_count = 0
            ayes = []
            nos = []
            others = []

            for segment in raw_vote[1:]:
                segment = segment.strip()
                tally = count_re.search(segment)
                if segment.startswith("Ayes...") and tally:
                    yes_count = int(tally.group())
                elif segment.startswith("Noes...") and tally:
                    no_count = int(tally.group())
                elif segment.startswith("Present and not voting...") and tally:
                    other_count += int(tally.group())
                else:
                    # Name lists: try aye, then no, then not-voting.
                    names = aye_re.search(segment)
                    if names:
                        ayes = names.group(1).split(", ")
                        continue
                    names = no_re.search(segment)
                    if names:
                        nos = names.group(1).split(", ")
                        continue
                    names = other_re.search(segment)
                    if names:
                        others += names.group(1).split(", ")

            vote = Vote(chamber, vote_date, motion, passed,
                        yes_count, no_count, other_count)
            vote.add_source(link)

            for a in ayes:
                vote.yes(a)
            for n in nos:
                vote.no(n)
            for o in others:
                vote.other(o)

            vote.validate()
            bill.add_vote(vote)

    return bill
def test_vote():
    """Verify a Vote's initial contents, voter recording and validation."""
    v = Vote('upper', datetime.datetime(2012, 1, 1), 'passage', True,
             3, 1, 2, note='note')

    assert_equal(
        v,
        {
            'chamber': 'upper',
            'date': datetime.datetime(2012, 1, 1),
            'motion': 'passage',
            'passed': True,
            'yes_count': 3,
            'no_count': 1,
            'other_count': 2,
            'type': 'other',
            'yes_votes': [],
            'no_votes': [],
            'other_votes': [],
            'note': 'note',
            '_type': 'vote',
            'sources': []
        })

    # (recorder, key holding the recorded names, names to record)
    roll_calls = (
        (v.yes, 'yes_votes', ['Lincoln', 'Adams', 'Johnson']),
        (v.no, 'no_votes', ['Kennedy']),
        (v.other, 'other_votes', ['Polk', 'Pierce']),
    )
    for record, key, names in roll_calls:
        for name in names:
            record(name)
        assert_equal(v[key], names)

    # validate should work: recorded voters match the declared totals
    v.validate()

    # now add someone else and make sure it doesn't validate
    v.yes('Clinton')
    with assert_raises(ValueError):
        v.validate()
def scrape_vote(self, bill, chamber, date, url):
    """Parse the vote PDF at ``url`` and attach the resulting Vote to ``bill``.

    The PDF is downloaded, converted to text and deleted again.  The motion
    is the fifth text line; totals come from the "Yeas - N", "Nays - N" and
    "Not Voting - N" markers; individual votes are read from the tenth line
    on, stopping at an "after roll call" or "Indication of Vote" line.

    Raises:
        IndexError: the text has fewer than five lines.
        AttributeError: one of the totals markers is missing.
    """
    (path, _resp) = self.urlretrieve(url)
    try:
        text = convert_pdf(path, 'text')
    finally:
        # Discard the downloaded PDF even when the conversion fails.
        os.remove(path)

    lines = text.split('\n')
    motion = lines[4].strip()

    yes_count = int(re.search(r'Yeas - (\d+)', text).group(1))
    no_count = int(re.search(r'Nays - (\d+)', text).group(1))
    other_count = int(re.search(r'Not Voting - (\d+)', text).group(1))
    passed = yes_count > (no_count + other_count)

    vote = Vote(chamber, date, motion, passed, yes_count, no_count,
                other_count)
    vote.add_source(url)

    for line in lines[9:]:
        if 'after roll call' in line or 'Indication of Vote' in line:
            break
        if 'Presiding' in line:
            continue

        # Each column ends in "-<district number>"; split on that suffix.
        for col in re.split(r'-\d+', line):
            col = col.strip()
            if not col:
                continue

            match = re.match(r'(Y|N|EX|\*)\s+(.+)$', col)
            if not match:
                # No vote code in front of the text: counted as "other".
                vote.other(col)
                continue

            code, name = match.groups()
            if code == 'Y':
                vote.yes(name)
            elif code == 'N':
                vote.no(name)
            elif code == '*':
                pass  # skip paired voters, don't factor into count
            else:
                vote.other(name)

    vote.validate()
    bill.add_vote(vote)
def scrape_vote(self, bill, chamber, date, url):
    """Parse the vote PDF at ``url`` and attach the resulting Vote to ``bill``.

    The PDF is downloaded, converted to text and deleted again.  The motion
    is the fifth text line; totals come from the "Yeas - N", "Nays - N" and
    "Not Voting - N" markers; individual votes are read from the tenth line
    on, stopping at an "after roll call" or "Indication of Vote" line.

    Raises:
        IndexError: the text has fewer than five lines.
        AttributeError: one of the totals markers is missing.
    """
    (path, _resp) = self.urlretrieve(url)
    try:
        text = convert_pdf(path, "text")
    finally:
        # Discard the downloaded PDF even when the conversion fails.
        os.remove(path)

    lines = text.split("\n")
    motion = lines[4].strip()

    yes_count = int(re.search(r"Yeas - (\d+)", text).group(1))
    no_count = int(re.search(r"Nays - (\d+)", text).group(1))
    other_count = int(re.search(r"Not Voting - (\d+)", text).group(1))
    passed = yes_count > (no_count + other_count)

    vote = Vote(chamber, date, motion, passed, yes_count, no_count,
                other_count)
    vote.add_source(url)

    for line in lines[9:]:
        if "after roll call" in line or "Indication of Vote" in line:
            break
        if "Presiding" in line:
            continue

        # Each column ends in "-<district number>"; split on that suffix.
        for col in re.split(r"-\d+", line):
            col = col.strip()
            if not col:
                continue

            match = re.match(r"(Y|N|EX)\s+(.+)$", col)
            if not match:
                # No vote code in front of the text: counted as "other".
                vote.other(col)
                continue

            code, name = match.groups()
            if code == "Y":
                vote.yes(name)
            elif code == "N":
                vote.no(name)
            else:
                vote.other(name)

    vote.validate()
    bill.add_vote(vote)
def scrape_vote(self, bill, chamber, date, url):
    """Parse the vote PDF at ``url`` and attach the resulting Vote to ``bill``.

    The PDF is downloaded, converted to text and deleted again.  The motion
    is the fifth text line; totals come from the "Yeas - N", "Nays - N" and
    "Not Voting - N" markers; individual votes are read from the tenth line
    on, stopping at an "after roll call" line.

    Raises:
        IndexError: the text has fewer than five lines.
        AttributeError: one of the totals markers is missing.
    """
    (path, _resp) = self.urlretrieve(url)
    try:
        text = convert_pdf(path, 'text')
    finally:
        # Discard the downloaded PDF even when the conversion fails.
        os.remove(path)

    lines = text.split('\n')
    motion = lines[4].strip()

    yes_count = int(re.search(r'Yeas - (\d+)', text).group(1))
    no_count = int(re.search(r'Nays - (\d+)', text).group(1))
    other_count = int(re.search(r'Not Voting - (\d+)', text).group(1))
    passed = yes_count > (no_count + other_count)

    vote = Vote(chamber, date, motion, passed, yes_count, no_count,
                other_count)
    vote.add_source(url)

    for line in lines[9:]:
        if 'after roll call' in line:
            break
        if 'Presiding' in line:
            continue

        # Each column ends in "-<district number>"; split on that suffix.
        for col in re.split(r'-\d+', line):
            col = col.strip()
            if not col:
                continue

            match = re.match(r'(Y|N|EX)\s+(.+)$', col)
            if not match:
                # No vote code in front of the text: counted as "other".
                vote.other(col)
                continue

            code, name = match.groups()
            if code == 'Y':
                vote.yes(name)
            elif code == 'N':
                vote.no(name)
            else:
                vote.other(name)

    vote.validate()
    bill.add_vote(vote)
def add_vote(self, bill, chamber, date, text, url):
    """Build a Vote from an action's text and attach it to ``bill``.

    The yes/no totals are read from the "Ayes N, Noes M" style phrase in
    ``text``.  The motion type is taken from a ``motion_classifiers``
    entry whose pattern matches the start of ``text`` ("other" when none
    does).  When ``url`` is given it is recorded as a source and handed to
    ``add_house_votes`` / ``add_senate_votes`` depending on whether it
    contains "av" or "sv".

    Raises:
        IndexError: ``text`` contains no recognisable totals.
    """
    # Tolerates "Ayes, 5; Noes, 3" as well as the plain "Ayes 5, Noes 3"
    # form, matching the other add_vote variant in this file.
    tallies = re.findall(r"Ayes,? (\d+)[,;]\s+N(?:oes|ays),? (\d+)", text)
    (yes, no) = int(tallies[0][0]), int(tallies[0][1])

    vtype = "other"
    # ``items()`` works on Python 2 and 3 alike, and the loop variable no
    # longer shadows the ``type`` builtin.
    for regex, motion_type in motion_classifiers.items():
        if re.match(regex, text):
            vtype = motion_type
            break

    v = Vote(chamber, date, text, yes > no, yes, no, 0, type=vtype)

    # fetch the vote itself
    if url:
        v.add_source(url)

        if "av" in url:
            self.add_house_votes(v, url)
        elif "sv" in url:
            self.add_senate_votes(v, url)

    # The action text never states an "other" total, so take it from the
    # roll call; with the hard-coded 0 any recorded non-voter makes the
    # declared total disagree with the recorded names.
    v["other_count"] = len(v["other_votes"])
    v.validate()
    bill.add_vote(v)
def scrape_lower_committee_votes(self, session_number, bill):
    '''
    House committee roll calls are not available on the Senate's
    website. Furthermore, the House uses an internal ID system in
    its URLs, making accessing those pages non-trivial.

    This function will fetch all the House committee votes for the
    given bill, and add the votes to that object.

    The bill is looked up by posting its number to the House bill search
    form; every "See Votes" link on the resulting bill page is then
    scraped into one Vote.

    Raises:
        ValueError: a page does not contain exactly one of an expected
            element (bill link, date, committee, action, member cell parts).
        IndexError: a member cell holds an unrecognised vote code.
    '''
    house_url = 'http://www.myfloridahouse.gov/Sections/Bills/bills.aspx'

    # Keep the digits and all following characters in the bill's ID
    bill_number = re.search(r'^\w+\s(\d+\w*)$', bill['bill_id']).group(1)

    form = {
        'rblChamber': 'B',
        'ddlSession': session_number,
        'ddlBillList': '-1',
        'txtBillNumber': bill_number,
        'ddlSponsor': '-1',
        'ddlReferredTo': '-1',
        'SubmittedByControl': '',
    }
    doc = lxml.html.fromstring(self.post(url=house_url, data=form).text)
    doc.make_links_absolute(house_url)

    (bill_link, ) = doc.xpath(
        '//a[contains(@href, "/Bills/billsdetail.aspx?BillId=")]/@href')
    bill_doc = self.lxmlize(bill_link)
    links = bill_doc.xpath('//a[text()="See Votes"]/@href')

    for link in links:
        vote_doc = self.lxmlize(link)

        (date, ) = vote_doc.xpath(
            '//span[@id="ctl00_ContentPlaceHolder1_lblDate"]/text()')
        date = datetime.datetime.strptime(
            date, '%m/%d/%Y %I:%M:%S %p').date()

        # The totals are in the last nested table; collapse whitespace so
        # the regex below can match across the original line breaks.
        totals = vote_doc.xpath('//table//table')[-1].text_content()
        totals = re.sub(r'(?mu)\s+', " ", totals).strip()
        # Both halves of the pattern are raw strings now; the second half
        # used to rely on Python passing unknown escapes through.
        (yes_count, no_count, other_count) = [
            int(x) for x in re.search(
                r'(?m)Total Yeas:\s+(\d+)\s+Total Nays:\s+(\d+)\s+'
                r'Total Missed:\s+(\d+)', totals).groups()
        ]
        passed = yes_count > no_count

        (committee, ) = vote_doc.xpath(
            '//span[@id="ctl00_ContentPlaceHolder1_lblCommittee"]/text()')
        (action, ) = vote_doc.xpath(
            '//span[@id="ctl00_ContentPlaceHolder1_lblAction"]/text()')
        motion = "{} ({})".format(action, committee)

        vote = Vote('lower', date, motion, passed, yes_count, no_count,
                    other_count)
        vote.add_source(link)

        for cell in vote_doc.xpath('//table//table//table//td'):
            if not cell.text_content().strip():
                continue

            # First span holds the vote code, second the member name.
            (member, ) = cell.xpath('span[2]//text()')
            (vote_code, ) = cell.xpath('span[1]//text()')

            if vote_code == "Y":
                vote.yes(member)
            elif vote_code == "N":
                vote.no(member)
            elif vote_code == "-":
                vote.other(member)
            # Parenthetical votes appear to not be counted in the
            # totals for Yea, Nay, _or_ Missed
            elif re.search(r'\([YN]\)', vote_code):
                continue
            else:
                raise IndexError(
                    "Unknown vote type found: {}".format(vote_code))

        vote.validate()
        bill.add_vote(vote)
def scrape_uppper_committee_vote(self, bill, date, url):
    """Parse a committee vote PDF at ``url`` and add the Vote to ``bill``.

    The PDF is downloaded, converted to text and deleted again.  The motion
    is whatever follows "FINAL ACTION:" on the sixth text line; if it is
    empty a warning is logged and nothing is added.  Member rows follow the
    "Yea  Nay ..." header row, and a member's vote is decided by the column
    at which the row's first non-blank character (the vote mark) sits
    relative to that header's first "Yea" and "Nay".

    Raises:
        AssertionError: a vote mark sits between the Yea and Nay columns.
        IndexError: the "Yea  Nay" header row is not found.
        AttributeError: a member row or the TOTALS line cannot be parsed.
    """
    (path, _resp) = self.urlretrieve(url)
    try:
        text = convert_pdf(path, 'text')
    finally:
        # Discard the downloaded PDF even when the conversion fails.
        os.remove(path)
    lines = text.split("\n")

    (_, motion) = lines[5].split("FINAL ACTION:")
    motion = motion.strip()
    if not motion:
        self.warning("Vote appears to be empty")
        return

    # First row that looks like the repeated "Yea  Nay" column header.
    # enumerate() replaces a lines.index() lookup per matching row.
    vote_top_row = [
        index for index, row in enumerate(lines)
        if re.search(r'^\s+Yea\s+Nay.*?(?:\s+Yea\s+Nay)+$', row)
    ][0]
    header_row = lines[vote_top_row]
    yea_columns_end = header_row.index("Yea") + len("Yea")
    nay_columns_begin = header_row.index("Nay")

    votes = {'yes': [], 'no': [], 'other': []}
    for line in lines[(vote_top_row + 1):]:
        # End loop as soon as no more members are found
        if not line.strip():
            break

        member = re.search(
            r'''(?x)
                ^\s+(?:[A-Z\-]+)?\s+    # Possible vote indicator
                ([A-Z][a-z]+            # Name must have lower-case characters
                [\w\-\s]+)              # Continue looking for the rest of the name
                (?:,[A-Z\s]+?)?         # Leadership has an all-caps title
                (?:\s{2,}.*)?           # Name ends when many spaces are seen
            ''', line).group(1)

        # Usually non-voting members won't even have a code listed
        # Only a couple of codes indicate an actual vote:
        # "VA" (vote after roll call) and "VC" (vote change)
        did_vote = bool(re.search(r'^\s+(X|VA|VC)\s+[A-Z][a-z]', line))
        if not did_vote:
            votes['other'].append(member)
            continue

        # Check where the "X" or vote code is on the page
        vote_column = len(line) - len(line.lstrip())
        if vote_column <= yea_columns_end:
            votes['yes'].append(member)
        elif vote_column >= nay_columns_begin:
            votes['no'].append(member)
        else:
            raise AssertionError(
                "Unparseable vote found for {0} in {1}:\n{2}".
                format(member, url, line))

    totals = re.search(
        r'(?msu)\s+(\d{1,3})\s+(\d{1,3})\s+.*?TOTALS', text).groups()
    yes_count = int(totals[0])
    no_count = int(totals[1])
    passed = (yes_count > no_count)
    # The PDF totals only cover yeas and nays; "other" is what was parsed.
    other_count = len(votes['other'])

    vote = Vote('upper', date, motion, passed, yes_count, no_count,
                other_count)
    vote.add_source(url)
    vote['yes_votes'] = votes['yes']
    vote['no_votes'] = votes['no']
    vote['other_votes'] = votes['other']

    vote.validate()
    bill.add_vote(vote)
def scrape_floor_vote(self, chamber, bill, date, url):
    """Parse a floor vote PDF at ``url`` and add the resulting Vote to ``bill``.

    The PDF is downloaded, converted to text and deleted again.  The motion
    normally sits on the fifth text line, may instead sit on the line
    above, and may continue for up to two further lines; the indexes of the
    totals line and of the first roll-call line shift accordingly.

    Raises:
        AssertionError: no motion text could be found.
        AttributeError: the totals line does not have the expected format.
        ValueError: the recorded votes still disagree with the totals after
            the fallback pass for members without a vote code.
    """
    (path, _resp) = self.urlretrieve(url)
    try:
        text = convert_pdf(path, 'text')
    finally:
        # Discard the downloaded PDF even when the conversion fails.
        os.remove(path)
    lines = text.split("\n")

    # Default line positions; adjusted below, hence ordinary locals.
    motion_index = 4
    totals_index = 6
    vote_start_index = 9

    motion = lines[motion_index].strip()
    # Sometimes there is no motion name, only "Passage" in the line above
    if (not motion and
            not lines[motion_index - 1].startswith("Calendar Page:")):
        motion = lines[motion_index - 1]
        motion_index -= 1
        totals_index -= 1
        vote_start_index -= 1
    elif not motion:
        # Raised explicitly (instead of ``assert``) so the check is not
        # stripped when Python runs with -O.
        raise AssertionError("Floor vote's motion name appears to be empty")

    # A long motion can spill over onto as many as two more lines.
    for _extra_motion_line in range(2):
        motion_index += 1
        if lines[motion_index].strip():
            motion = "{}, {}".format(motion, lines[motion_index].strip())
            totals_index += 1
            vote_start_index += 1
        else:
            break

    (yes_count, no_count, other_count) = [
        int(x) for x in re.search(
            r'^\s+Yeas - (\d+)\s+Nays - (\d+)\s+Not Voting - (\d+)\s*$',
            lines[totals_index]).groups()
    ]
    passed = (yes_count > no_count)

    vote = Vote(chamber, date, motion, passed, yes_count, no_count,
                other_count)
    vote.add_source(url)

    roll_call_lines = lines[vote_start_index:]

    for line in roll_call_lines:
        if not line.strip():
            break

        # Drop the presiding officer's title so it is not read as a name.
        if " President " in line:
            line = line.replace(" President ", " ")
        elif " Speaker " in line:
            line = line.replace(" Speaker ", " ")

        # Votes follow the pattern of:
        # [vote code] [member name]-[district number]
        for member in re.findall(r'\s*Y\s+(.*?)-\d{1,3}\s*', line):
            vote.yes(member)
        for member in re.findall(r'\s*N\s+(.*?)-\d{1,3}\s*', line):
            vote.no(member)
        for member in re.findall(r'\s*(?:EX|AV)\s+(.*?)-\d{1,3}\s*', line):
            vote.other(member)

    try:
        vote.validate()
    except ValueError:
        # On a rare occasion, a member won't have a vote code,
        # which indicates that they didn't vote. The totals reflect
        # this.
        self.logger.info("Votes don't add up; looking for additional ones")
        for line in roll_call_lines:
            if not line.strip():
                break
            for member in re.findall(
                    r'\s{8,}([A-Z][a-z\'].*?)-\d{1,3}', line):
                vote.other(member)

    vote.validate()
    bill.add_vote(vote)
def scrape_vote(self, bill, action_text, url):
    """Build a Vote from an action line plus its detail page and add it to ``bill``.

    ``action_text`` looks like
    "Vote - Senate Floor - Third Reading Passed (46-0) - 01/16/12"; it
    supplies the chamber, motion, outcome and preliminary yes/no totals.
    The page at ``url`` supplies the date, the final totals and the
    per-member votes (read from its ``<nobr>`` text nodes).

    Raises:
        Exception: the chamber prefix or the motion outcome is not recognised.
    """
    doc = lxml.html.fromstring(self.urlopen(url))

    # process action_text - might look like
    # "Vote - Senate Floor - Third Reading Passed (46-0) - 01/16/12"
    if action_text.startswith('Vote - Senate Floor - '):
        action_text = action_text[22:]
        chamber = 'upper'
    elif action_text.startswith('Vote - House Floor - '):
        action_text = action_text[21:]
        chamber = 'lower'
    else:
        # Previously this fell through and crashed later on an unbound
        # ``chamber``; fail here with a message that names the input.
        raise Exception('unknown vote chamber: %s' % action_text)

    # Only the trailing date is split off; the motion may contain ' - '.
    motion, unused_date = action_text.rsplit(' - ', 1)
    yes_count, no_count = re.findall(r'\((\d+)-(\d+)\)', motion)[0]
    if 'Passed' in motion:
        motion = motion.split(' Passed')[0]
        passed = True
    elif 'Adopted' in motion:
        motion = motion.split(' Adopted')[0]
        passed = True
    elif 'Rejected' in motion:
        motion = motion.split(' Rejected')[0]
        passed = False
    elif 'Failed' in motion:
        motion = motion.split(' Failed')[0]
        passed = False
    elif 'Floor Amendment' in motion:
        passed = int(yes_count) > int(no_count)
    else:
        raise Exception('unknown motion: %s' % motion)

    vote = Vote(chamber=chamber, date=None, motion=motion,
                yes_count=int(yes_count), no_count=int(no_count),
                other_count=0, passed=passed)

    # ``vfunc`` is the recorder for the section currently being read; plain
    # text nodes following a section heading are passed to it.
    vfunc = None

    for text in doc.xpath('//nobr/text()'):
        text = text.replace(u'\xa0', ' ')
        if text.startswith('Calendar Date: '):
            stamp = text.split(': ', 1)[1]
            try:
                # %p is only honoured together with %I (12-hour clock);
                # with %H the AM/PM marker is silently ignored.
                vote['date'] = datetime.datetime.strptime(
                    stamp, '%b %d, %Y %I:%M %p')
            except ValueError:
                # Hours outside 01-12: keep the previous 24-hour parsing.
                vote['date'] = datetime.datetime.strptime(
                    stamp, '%b %d, %Y %H:%M %p')
        elif 'Yeas' in text and 'Nays' in text and 'Not Voting' in text:
            self.debug(text)
            yeas, nays, nv, exc, absent = re.match(
                r'(\d+) Yeas\s+(\d+) Nays\s+(\d+) Not Voting\s+(\d+) Excused \(Absent\)\s+(\d+) Absent',
                text).groups()
            vote['yes_count'] = int(yeas)
            vote['no_count'] = int(nays)
            vote['other_count'] = int(nv) + int(exc) + int(absent)
        elif 'Voting Yea' in text:
            vfunc = vote.yes
        elif 'Voting Nay' in text:
            vfunc = vote.no
        elif 'Not Voting' in text or 'Excused' in text:
            vfunc = vote.other
        elif vfunc:
            vfunc(text)

    vote.validate()
    vote.add_source(url)
    bill.add_vote(vote)
def scrape_votes(self, bill, url):
    """Scrape every roll call on the page at ``url`` and add it to ``bill``.

    Each vote starts at a paragraph matching "OKLAHOMA HOUSE" or
    "OKLAHOMA STATE SENATE".  The motion, RCS number, date, totals and
    member names are read from the paragraphs that follow it.  A vote
    whose totals line carries trailing text is logged and skipped.

    Raises:
        AssertionError: the motion paragraph is empty.
        Exception: a name in the "no" list contains ':'.
    """
    page = lxml.html.fromstring(self.get(url).text.replace(u'\xa0', ' '))

    re_ns = "http://exslt.org/regular-expressions"
    # Raw string: the backslashes are meant for the EXSLT regex engine.
    path = r"//p[re:test(text(), 'OKLAHOMA\s+(HOUSE|STATE\s+SENATE)')]"
    for header in page.xpath(path, namespaces={'re': re_ns}):
        bad_vote = False
        # Each chamber has the motion name on a different line of the file
        if 'HOUSE' in header.xpath("string()"):
            chamber = 'lower'
            motion_index = 8
        else:
            chamber = 'upper'
            motion_index = 13

        motion = header.xpath(
            "string(following-sibling::p[%d])" % motion_index).strip()
        motion = re.sub(r'\s+', ' ', motion)
        if not motion.strip():
            # Raised explicitly (instead of ``assert``) so the check is
            # not stripped when Python runs with -O.
            raise AssertionError("Motion text not found")
        match = re.match(r'^(.*) (PASSED|FAILED)$', motion)
        if match:
            motion = match.group(1)
            passed = match.group(2) == 'PASSED'
        else:
            # Outcome not stated; derived from the counts further down.
            passed = None

        rcs_p = header.xpath(
            "following-sibling::p[contains(., 'RCS#')]")[0]
        rcs_line = rcs_p.xpath("string()").replace(u'\xa0', ' ')
        rcs = re.search(r'RCS#\s+(\d+)', rcs_line).group(1)

        date_line = rcs_p.getnext().xpath("string()")
        date = re.search(r'\d+/\d+/\d+', date_line).group(0)
        date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

        vtype = None
        counts = collections.defaultdict(int)
        votes = collections.defaultdict(list)

        # Nothing is recorded until the real "YEAS:" heading was seen.
        seen_yes = False

        for sib in header.xpath("following-sibling::p")[13:]:
            line = sib.xpath("string()").replace('\r\n', ' ').strip()
            if "*****" in line:
                break

            match = re.match(
                r'(YEAS|NAYS|EXCUSED|VACANT|CONSTITUTIONAL PRIVILEGE|NOT VOTING|N/V)\s*:\s*(\d+)(.*)',
                line)
            if match:
                if match.group(1) == 'YEAS' and 'RCS#' not in line:
                    vtype = 'yes'
                    seen_yes = True
                elif match.group(1) == 'NAYS' and seen_yes:
                    vtype = 'no'
                elif match.group(1) == 'VACANT':
                    continue  # skip these
                elif seen_yes:
                    vtype = 'other'
                if seen_yes and match.group(3).strip():
                    self.logger.warning("Bad vote format, skipping.")
                    bad_vote = True
                counts[vtype] += int(match.group(2))
            elif seen_yes:
                # NOTE(review): the separator is reproduced as it appears
                # in the source; the 'SENATE ' filter below can only match
                # if names are separated by more than one space - confirm
                # the intended delimiter.
                for name in line.split(' '):
                    if not name:
                        continue
                    if 'HOUSE' in name or 'SENATE ' in name:
                        continue
                    votes[vtype].append(name.strip())

        if bad_vote:
            continue

        if passed is None:
            passed = counts['yes'] > (counts['no'] + counts['other'])

        vote = Vote(chamber, date, motion, passed,
                    counts['yes'], counts['no'], counts['other'],
                    rcs_num=rcs)
        vote.validate()

        vote.add_source(url)

        for name in votes['yes']:
            vote.yes(name)
        for name in votes['no']:
            if ':' in name:
                # A colon means a heading leaked into the name list.
                raise Exception(name)
            vote.no(name)
        for name in votes['other']:
            vote.other(name)

        vote.validate()
        bill.add_vote(vote)
def scrape_votes(self, bill, link):
    """Scrape every vote listed on the page at ``link`` and add it to ``bill``.

    The text of the ``lblVoteData`` span is split into one chunk per vote;
    each chunk is split again on a run of ten non-breaking spaces into the
    motion followed by count lines ('Ayes...N') and name lists
    ('... voting aye were: A, B -').  The chamber is 'lower' when ``link``
    contains 'ChamberVoting=H', otherwise 'upper'.

    Returns:
        The same ``bill`` object, with the votes added.
    """
    # Loop-invariant work is done once instead of once per vote.
    count_re = re.compile(r'\d+$')
    aye_re = re.compile(r'^.+voting aye were: (.+) -')
    no_re = re.compile(r'^.+voting no were: (.+) -')
    other_re = re.compile(r'^.+present and not voting were: (.+) -')
    chamber = 'lower' if 'ChamberVoting=H' in link else 'upper'

    page = self.urlopen(link)
    page = lxml.html.fromstring(page)
    raw_vote_data = page.xpath("//span[@id='lblVoteData']")[0].text_content()
    raw_vote_data = re.split(r'\w+? by [\w ]+?\s+-', raw_vote_data.strip())[1:]

    for raw_vote in raw_vote_data:
        raw_vote = raw_vote.split(u'\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0')
        motion = raw_vote[0]

        # Left as None when the motion text carries no date.
        vote_date = re.search(r'(\d+/\d+/\d+)', motion)
        if vote_date:
            vote_date = datetime.datetime.strptime(
                vote_date.group(), '%m/%d/%Y')

        passed = ('Passed' in motion
                  or 'Recommended for passage' in motion
                  or 'Adopted' in raw_vote[1])

        yes_count = 0
        no_count = 0
        other_count = 0
        ayes = []
        nos = []
        others = []

        for segment in raw_vote[1:]:
            segment = segment.strip()
            tally = count_re.search(segment)
            if segment.startswith('Ayes...') and tally:
                yes_count = int(tally.group())
            elif segment.startswith('Noes...') and tally:
                no_count = int(tally.group())
            elif segment.startswith('Present and not voting...') and tally:
                other_count += int(tally.group())
            else:
                # Name lists: try aye, then no, then not-voting.
                names = aye_re.search(segment)
                if names:
                    ayes = names.group(1).split(', ')
                    continue
                names = no_re.search(segment)
                if names:
                    nos = names.group(1).split(', ')
                    continue
                names = other_re.search(segment)
                if names:
                    others += names.group(1).split(', ')

        vote = Vote(chamber, vote_date, motion, passed,
                    yes_count, no_count, other_count)
        vote.add_source(link)

        for a in ayes:
            vote.yes(a)
        for n in nos:
            vote.no(n)
        for o in others:
            vote.other(o)

        vote.validate()
        bill.add_vote(vote)

    return bill
def scrape(self, session, chambers):
    """Scrape all bills and resolutions of ``session`` and save each one.

    Bills and resolutions are loaded from the legislature's JSON
    endpoints.  For every item the status page (sponsors, text versions),
    the action list and the roll-call votes are fetched and attached before
    ``save_bill`` is called.  ``chambers`` is accepted but not used: the
    chamber is derived from each bill number.

    Raises:
        AssertionError: a bill number, actor, passage sequence or roll-call
            result does not fit the expected patterns.
    """
    HTML_TAGS_RE = r'<.*?>'

    # Characters from index 5 on; presumably the year part of the session
    # name - TODO confirm against the session naming scheme.
    year_slug = session[5:]

    # Load all bills and resolutions via the private API
    bills_url = \
        'http://legislature.vermont.gov/bill/loadBillsIntroduced/{}/'.\
        format(year_slug)
    bills_json = self.get(bills_url).text
    bills = json.loads(bills_json)['data']

    resolutions_url = \
        'http://legislature.vermont.gov/bill/loadAllResolutionsByChamber/{}/both'.\
        format(year_slug)
    resolutions_json = self.get(resolutions_url).text
    bills.extend(json.loads(resolutions_json)['data'])

    # Parse the information from each bill
    for info in bills:
        # Strip whitespace from strings
        # (items() works on both Python 2 and Python 3)
        info = {k: v.strip() for k, v in info.items()}

        # Identify the bill type and chamber.  Longer prefixes are tested
        # first because 'H.' and 'S.' would match the resolutions as well.
        if info['BillNumber'].startswith('J.R.H.'):
            bill_type = 'joint resolution'
            bill_chamber = 'lower'
        elif info['BillNumber'].startswith('J.R.S.'):
            bill_type = 'joint resolution'
            bill_chamber = 'upper'

        elif info['BillNumber'].startswith('H.C.R.'):
            bill_type = 'concurrent resolution'
            bill_chamber = 'lower'
        elif info['BillNumber'].startswith('S.C.R.'):
            bill_type = 'concurrent resolution'
            bill_chamber = 'upper'

        elif info['BillNumber'].startswith('H.R.'):
            bill_type = 'resolution'
            bill_chamber = 'lower'
        elif info['BillNumber'].startswith('S.R.'):
            bill_type = 'resolution'
            bill_chamber = 'upper'

        elif info['BillNumber'].startswith('PR.'):
            bill_type = 'constitutional amendment'
            if info['Body'] == 'H':
                bill_chamber = 'lower'
            elif info['Body'] == 'S':
                bill_chamber = 'upper'
            else:
                raise AssertionError("Amendment not tied to chamber")

        elif info['BillNumber'].startswith('H.'):
            bill_type = 'bill'
            bill_chamber = 'lower'
        elif info['BillNumber'].startswith('S.'):
            bill_type = 'bill'
            bill_chamber = 'upper'

        else:
            raise AssertionError(
                "Unknown bill type found: '{}'".
                format(info['BillNumber']))

        # Create the bill using its basic information
        bill = Bill(
            session=session,
            bill_id=info['BillNumber'],
            title=info['Title'],
            chamber=bill_chamber,
            type=bill_type
        )
        if 'resolution' in bill_type:
            bill.add_source(resolutions_url)
        else:
            bill.add_source(bills_url)

        # Load the bill's information page to access its metadata
        bill_url = \
            'http://legislature.vermont.gov/bill/status/{0}/{1}'.\
            format(year_slug, info['BillNumber'])
        doc = self.lxmlize(bill_url)
        bill.add_source(bill_url)

        # Capture sponsors
        sponsors = doc.xpath(
            '//dl[@class="summary-table"]/dt[text()="Sponsor(s)"]/'
            'following-sibling::dd[1]/ul/li'
        )
        sponsor_type = 'primary'
        for sponsor in sponsors:
            if sponsor.xpath('span/text()') == ['Additional Sponsors']:
                sponsor_type = 'cosponsor'
                continue

            sponsor_name = sponsor.xpath('a/text()')[0].\
                replace("Rep.", "").replace("Sen.", "").strip()
            # Skip the five-character "Less" + one trailing character
            # entry (presumably the list's collapse link - TODO confirm).
            # The previous test compared the first FIVE characters with
            # the four-letter "Less" while also requiring a length of
            # five, which can never hold, so nothing was ever skipped.
            is_less_link = (sponsor_name[:4] == "Less" and
                            len(sponsor_name) == 5)
            if sponsor_name and not is_less_link:
                bill.add_sponsor(sponsor_type, sponsor_name)

        # Capture bill text versions
        versions = doc.xpath(
            '//dl[@class="summary-table"]/dt[text()="Bill/Resolution Text"]/'
            'following-sibling::dd[1]/ul/li/a'
        )
        for version in versions:
            bill.add_version(
                name=version.xpath('text()')[0],
                url=version.xpath('@href')[0].replace(' ', '%20'),
                mimetype='application/pdf'
            )

        # Identify the internal bill ID, used for actions and votes
        # If there is no internal bill ID, then it has no extra information
        try:
            internal_bill_id = re.search(
                r'"bill/loadBillDetailedStatus/{}/(\d+)"'.format(year_slug),
                lxml.etree.tostring(doc)
            ).group(1)
        except AttributeError:
            self.warning("Bill {} appears to have no activity".
                         format(info['BillNumber']))
            self.save_bill(bill)
            continue

        # Capture actions
        actions_url = 'http://legislature.vermont.gov/bill/loadBillDetailedStatus/{0}/{1}'.\
            format(year_slug, internal_bill_id)
        actions_json = self.get(actions_url).text
        actions = json.loads(actions_json)['data']
        bill.add_source(actions_url)

        # Chamber codes ("H"/"S") that have already passed this bill.
        chambers_passed = set()
        for action in actions:
            action = {k: v.strip() for k, v in action.items()}

            if "Signed by Governor" in action['FullStatus']:
                actor = 'governor'
            elif action['ChamberCode'] == 'H':
                actor = 'lower'
            elif action['ChamberCode'] == 'S':
                actor = 'upper'
            else:
                raise AssertionError("Unknown actor for bill action")

            # Categorize action.  The sanity checks raise AssertionError
            # explicitly so they are not stripped under ``python -O``.
            if "Signed by Governor" in action['FullStatus']:
                if chambers_passed != set("HS"):
                    raise AssertionError(
                        "Bill signed before passing both chambers")
                action_type = 'governor:signed'
            elif actor == 'lower' and \
                    action['FullStatus'] in (
                        "Passed",
                        "Read Third time and Passed",
                        "Adopted",
                        "Adopted in Concurrence",
                        "Read and Adopted",
                        "Read and Adopted in Concurrence",
                        "Passed in Concurrence",
                        "Passed in Concurrence with Proposal of Amendment"):
                action_type = 'bill:passed'
                if "H" in chambers_passed:
                    raise AssertionError("Bill passed the House twice")
                chambers_passed.add("H")
            elif actor == 'upper' and \
                    any(action['FullStatus'].startswith(x) for x in (
                        "Read 3rd time & passed",
                        "Read & adopted",
                        "Adopted")):
                action_type = 'bill:passed'
                if "S" in chambers_passed:
                    raise AssertionError("Bill passed the Senate twice")
                chambers_passed.add("S")
            else:
                action_type = 'other'

            bill.add_action(
                actor=actor,
                action=re.sub(HTML_TAGS_RE, "", action['FullStatus']),
                date=datetime.datetime.strptime(
                    action['StatusDate'], '%m/%d/%Y'),
                type=action_type
            )

        # Capture votes
        votes_url = 'http://legislature.vermont.gov/bill/loadBillRollCalls/{0}/{1}'.\
            format(year_slug, internal_bill_id)
        votes_json = self.get(votes_url).text
        votes = json.loads(votes_json)['data']
        bill.add_source(votes_url)

        for vote in votes:
            roll_call_id = vote['VoteHeaderID']
            roll_call_url = 'http://legislature.vermont.gov/bill/loadBillRollCallDetails/{0}/{1}'.\
                format(year_slug, roll_call_id)
            roll_call_json = self.get(roll_call_url).text
            roll_call = json.loads(roll_call_json)['data']

            roll_call_yea = []
            roll_call_nay = []
            roll_call_other = []
            for member in roll_call:
                # Names are formatted "<name> of <district>".
                (member_name, _district) = \
                    member['MemberName'].split(" of ")
                member_name = member_name.strip()

                if member['MemberVote'] == "Yea":
                    roll_call_yea.append(member_name)
                elif member['MemberVote'] == "Nay":
                    roll_call_nay.append(member_name)
                else:
                    roll_call_other.append(member_name)

            if "Passed -- " in vote['FullStatus']:
                did_pass = True
            elif "Failed -- " in vote['FullStatus']:
                did_pass = False
            else:
                raise AssertionError("Roll call vote result is unclear")

            # Check vote counts
            yea_count = \
                int(re.search(r'Yeas = (\d+)', vote['FullStatus']).group(1))
            nay_count = \
                int(re.search(r'Nays = (\d+)', vote['FullStatus']).group(1))

            vote_to_add = Vote(
                chamber=(
                    'lower' if vote['ChamberCode'] == 'H' else 'upper'
                ),
                date=datetime.datetime.strptime(
                    vote['StatusDate'], '%m/%d/%Y'),
                motion=re.sub(HTML_TAGS_RE, "", vote['FullStatus']).strip(),
                passed=did_pass,
                yes_count=yea_count,
                no_count=nay_count,
                other_count=len(roll_call_other)
            )
            vote_to_add.add_source(roll_call_url)

            for member in roll_call_yea:
                vote_to_add.yes(member)
            for member in roll_call_nay:
                vote_to_add.no(member)
            for member in roll_call_other:
                vote_to_add.other(member)

            # A count mismatch is reported but the vote is still kept.
            try:
                vote_to_add.validate()
            except ValueError as e:
                self.warning(e)

            bill.add_vote(vote_to_add)

        # Capture extra information
        # This is not in the OpenStates spec, but is available
        # Not yet implemented
        # Witnesses: http://legislature.vermont.gov/bill/loadBillWitnessList/{year_slug}/{internal_bill_id}
        # Conference committee members: http://legislature.vermont.gov/bill/loadBillConference/{year_slug}/{bill_number}
        # Committee meetings: http://legislature.vermont.gov/committee/loadHistoryByBill/{year_slug}?LegislationId={internal_bill_id}

        self.save_bill(bill)
def scrape_votes(self, bill, url):
    """Parse every roll call on the page at ``url`` and add it to ``bill``.

    Each paragraph matching the "OKLAHOMA HOUSE" / "OKLAHOMA STATE SENATE"
    header starts one roll call.  The motion, RCS number and date are read
    from paragraphs following that header; tally lines and member names are
    then collected until a line containing asterisks is reached.
    """
    html = self.urlopen(url).replace(u'\xa0', ' ')
    page = lxml.html.fromstring(html)

    namespaces = {'re': "http://exslt.org/regular-expressions"}
    header_path = "//p[re:test(text(), 'OKLAHOMA\s+(HOUSE|STATE\s+SENATE)')]"
    tally_re = re.compile(
        r'(YEAS|NAYS|EXCUSED|VACANT|CONSTITUTIONAL PRIVILEGE|NOT VOTING)\s*:\s*(\d+)')
    label_to_type = {'YEAS': 'yes', 'NAYS': 'no'}

    for header in page.xpath(header_path, namespaces=namespaces):
        in_house = 'HOUSE' in header.xpath("string()")
        chamber = 'lower' if in_house else 'upper'
        # The motion paragraph sits at a different offset for each chamber
        motion_offset = 8 if in_house else 9

        motion = header.xpath(
            "string(following-sibling::p[%d])" % motion_offset).strip()
        motion = re.sub(r'\s+', ' ', motion)

        passed = None
        outcome = re.match(r'^(.*) (PASSED|FAILED)$', motion)
        if outcome:
            motion, result = outcome.groups()
            passed = result == 'PASSED'

        rcs_p = header.xpath(
            "following-sibling::p[contains(., 'RCS#')]")[0]
        rcs_text = rcs_p.xpath("string()").replace(u'\xa0', ' ')
        rcs = re.search(r'RCS#\s+(\d+)', rcs_text).group(1)

        # The date is read from the element right after the RCS paragraph
        date_text = rcs_p.getnext().xpath("string()")
        date_string = re.search(r'\d+/\d+/\d+', date_text).group(0)
        date = datetime.datetime.strptime(date_string, "%m/%d/%Y").date()

        vtype = None
        counts = collections.defaultdict(int)
        names_by_type = collections.defaultdict(list)

        # Only the sibling paragraphs from index 13 onwards are examined
        for sib in header.xpath("following-sibling::p")[13:]:
            line = sib.xpath("string()").replace('\r\n', ' ').strip()
            if "*****" in line:
                break

            tally = tally_re.match(line)
            if tally is None:
                # Not a tally line: treat it as names for the current type
                for name in line.split(' '):
                    if not name:
                        continue
                    if 'HOUSE BILL' in name or 'SENATE BILL' in name:
                        continue
                    names_by_type[vtype].append(name.strip())
                continue

            label, number = tally.groups()
            if label == 'VACANT':
                continue  # skip these
            vtype = label_to_type.get(label, 'other')
            counts[vtype] += int(number)

        if passed is None:
            passed = counts['yes'] > (counts['no'] + counts['other'])

        if not motion:
            motion = 'Senate Vote' if chamber == 'upper' else 'House Vote'

        vote = Vote(chamber, date, motion, passed,
                    counts['yes'], counts['no'], counts['other'],
                    rcs_num=rcs)
        vote.validate()
        vote.add_source(url)

        for record, key in ((vote.yes, 'yes'),
                            (vote.no, 'no'),
                            (vote.other, 'other')):
            for name in names_by_type[key]:
                record(name)

        bill.add_vote(vote)
def scrape_votes(self, bill, link):
    """Parse the votes listed on the page at ``link`` and add them to ``bill``.

    The text of the ``lblVoteData`` span is split into one chunk per vote;
    each chunk is further split on a run of non-breaking spaces into the
    motion followed by its tally and name segments.

    Returns ``bill`` after all votes have been added.
    """
    page = lxml.html.fromstring(self.urlopen(link))

    raw_vote_data = page.xpath(
        "//span[@id='lblVoteData']")[0].text_content()
    raw_vote_data = re.split(r'\w+? by [\w ]+?\s+-',
                             raw_vote_data.strip())[1:]

    # Compiled once here rather than on every loop iteration
    date_re = re.compile(r'(\d+/\d+/\d+)')
    count_re = re.compile(r'\d+$')
    aye_re = re.compile(r'^.+voting aye were: (.+) -')
    no_re = re.compile(r'^.+voting no were: (.+) -')
    other_re = re.compile(r'^.+present and not voting were: (.+) -')

    # The chamber depends only on the link, not on the individual vote
    chamber = 'lower' if 'ChamberVoting=H' in link else 'upper'

    for raw_vote in raw_vote_data:
        raw_vote = raw_vote.split(
            u'\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0')
        motion = raw_vote[0]

        # Left as None when the motion text carries no date
        vote_date = date_re.search(motion)
        if vote_date:
            vote_date = datetime.datetime.strptime(vote_date.group(),
                                                   '%m/%d/%Y')

        # raw_vote[1:2] is empty when there is no detail segment, which
        # avoids the IndexError a bare raw_vote[1] would raise
        passed = ('Passed' in motion or
                  'Recommended for passage' in motion or
                  any('Adopted' in part for part in raw_vote[1:2]))

        yes_count = 0
        no_count = 0
        other_count = 0
        ayes = []
        nos = []
        others = []

        for part in raw_vote[1:]:
            part = part.strip()
            tally = count_re.search(part)
            if tally and part.startswith('Ayes...'):
                yes_count = int(tally.group())
                continue
            if tally and part.startswith('Noes...'):
                no_count = int(tally.group())
                continue
            if tally and part.startswith('Present and not voting...'):
                other_count += int(tally.group())
                continue

            names = aye_re.search(part)
            if names:
                ayes = names.group(1).split(', ')
                continue
            names = no_re.search(part)
            if names:
                nos = names.group(1).split(', ')
                continue
            names = other_re.search(part)
            if names:
                others += names.group(1).split(', ')

        vote = Vote(chamber, vote_date, motion, passed,
                    yes_count, no_count, other_count)
        vote.add_source(link)

        for name in ayes:
            vote.yes(name)
        for name in nos:
            vote.no(name)
        for name in others:
            vote.other(name)

        vote.validate()
        bill.add_vote(vote)

    return bill
def scrape_uppper_committee_vote(self, bill, date, url):
    """Parse a committee vote PDF at ``url`` and add it to ``bill``.

    The PDF is downloaded, converted to text and then read line by line:
    the motion comes from the "FINAL ACTION:" line, member votes from the
    rows under the "Yea  Nay" header, and the totals from the line
    preceding "TOTALS".  The vote is recorded for the 'upper' chamber
    with the supplied ``date``.

    Returns None without adding a vote when the motion text is empty.
    Raises AssertionError when a vote mark falls between the Yea and Nay
    columns.
    """
    (path, _resp) = self.urlretrieve(url)
    try:
        text = convert_pdf(path, 'text')
    finally:
        # Remove the downloaded file even if the conversion fails
        os.remove(path)
    lines = text.split("\n")

    (_, motion) = lines[5].split("FINAL ACTION:")
    motion = motion.strip()
    if not motion:
        self.warning("Vote appears to be empty")
        return

    # Index of the first line holding the repeated "Yea  Nay" header
    header_re = re.compile(r'^\s+Yea\s+Nay.*?(?:\s+Yea\s+Nay)+$')
    vote_top_row = [
        index for index, row in enumerate(lines)
        if header_re.search(row)][0]
    yea_columns_end = lines[vote_top_row].index("Yea") + len("Yea")
    nay_columns_begin = lines[vote_top_row].index("Nay")

    member_re = re.compile(r'''(?x)
        ^\s+(?:[A-Z\-]+)?\s+  # Possible vote indicator
        ([A-Z][a-z]+          # Name must have lower-case characters
        [\w\-\s]+)            # Continue looking for the rest of the name
        (?:,[A-Z\s]+?)?       # Leadership has an all-caps title
        (?:\s{2,}.*)?         # Name ends when many spaces are seen
        ''')
    # Usually non-voting members won't even have a code listed
    # Only a couple of codes indicate an actual vote:
    # "VA" (vote after roll call) and "VC" (vote change)
    voted_re = re.compile(r'^\s+(X|VA|VC)\s+[A-Z][a-z]')

    votes = {'yes': [], 'no': [], 'other': []}
    for line in lines[(vote_top_row + 1):]:
        # End loop as soon as no more members are found
        if not line.strip():
            break

        member = member_re.search(line).group(1)

        if not voted_re.search(line):
            votes['other'].append(member)
            continue

        # Check where the "X" or vote code is on the page
        vote_column = len(line) - len(line.lstrip())
        if vote_column <= yea_columns_end:
            votes['yes'].append(member)
        elif vote_column >= nay_columns_begin:
            votes['no'].append(member)
        else:
            raise AssertionError(
                "Unparseable vote found for {0} in {1}:\n{2}".
                format(member, url, line))

    totals = re.search(
        r'(?msu)\s+(\d{1,3})\s+(\d{1,3})\s+.*?TOTALS', text).groups()
    yes_count = int(totals[0])
    no_count = int(totals[1])
    passed = (yes_count > no_count)
    other_count = len(votes['other'])

    vote = Vote('upper', date, motion, passed,
                yes_count, no_count, other_count)
    vote.add_source(url)
    vote['yes_votes'] = votes['yes']
    vote['no_votes'] = votes['no']
    vote['other_votes'] = votes['other']

    vote.validate()
    bill.add_vote(vote)
def scrape_lower_committee_votes(self, session_number, bill):
    """
    Fetch all House committee votes for ``bill`` and add them to it.

    House committee roll calls are not available on the Senate's website,
    and the House uses an internal ID system in its URLs, so the bill is
    first looked up through the House search form and its "See Votes"
    links are then followed one by one.
    """
    house_url = "http://www.myfloridahouse.gov/Sections/Bills/bills.aspx"

    # Keep the digits and all following characters in the bill's ID
    id_match = re.search(r"^\w+\s(\d+\w*)$", bill["bill_id"])
    bill_number = id_match.group(1)

    form = {
        "rblChamber": "B",
        "ddlSession": session_number,
        "ddlBillList": "-1",
        "txtBillNumber": bill_number,
        "ddlSponsor": "-1",
        "ddlReferredTo": "-1",
        "SubmittedByControl": "",
    }
    response = self.post(url=house_url, data=form)
    doc = lxml.html.fromstring(response.text)
    doc.make_links_absolute(house_url)

    # Unpacking enforces that exactly one bill detail link is present
    (bill_link,) = doc.xpath(
        '//a[contains(@href, "/Bills/billsdetail.aspx?BillId=")]/@href')
    bill_doc = self.lxmlize(bill_link)

    for link in bill_doc.xpath('//a[text()="See Votes"]/@href'):
        vote_doc = self.lxmlize(link)

        (date,) = vote_doc.xpath(
            '//span[@id="ctl00_ContentPlaceHolder1_lblDate"]/text()')
        date = datetime.datetime.strptime(
            date, "%m/%d/%Y %I:%M:%S %p").date()

        # The totals are taken from the last nested table on the page
        totals = vote_doc.xpath("//table//table")[-1].text_content()
        totals = re.sub(r"(?mu)\s+", " ", totals).strip()
        tally = re.search(
            r"(?m)Total Yeas:\s+(\d+)\s+Total Nays:\s+(\d+)\s+"
            "Total Missed:\s+(\d+)", totals)
        yes_count, no_count, other_count = map(int, tally.groups())
        passed = yes_count > no_count

        (committee,) = vote_doc.xpath(
            '//span[@id="ctl00_ContentPlaceHolder1_lblCommittee"]/text()')
        (action,) = vote_doc.xpath(
            '//span[@id="ctl00_ContentPlaceHolder1_lblAction"]/text()')
        motion = "{} ({})".format(action, committee)

        vote = Vote("lower", date, motion, passed,
                    yes_count, no_count, other_count)
        vote.add_source(link)

        recorders = {"Y": vote.yes, "N": vote.no, "-": vote.other}
        for cell in vote_doc.xpath("//table//table//table//td"):
            if not cell.text_content().strip():
                continue

            (member,) = cell.xpath("span[2]//text()")
            (code,) = cell.xpath("span[1]//text()")

            record = recorders.get(code)
            if record is not None:
                record(member)
            # Parenthetical votes appear to not be counted in the
            # totals for Yea, Nay, _or_ Missed
            elif re.search(r"\([YN]\)", code):
                continue
            else:
                raise IndexError(
                    "Unknown vote type found: {}".format(code))

        vote.validate()
        bill.add_vote(vote)
def scrape(self, session, chambers):
    """Scrape bills and resolutions for ``session`` and save each of them.

    The released-bill, introduced-bill and resolution listings are loaded
    from the legislature's JSON endpoints.  For every entry a Bill is
    built, then its sponsors, text versions, actions and roll-call votes
    are attached before it is passed to ``self.save_bill``.

    ``chambers`` is accepted but not used by this method.

    Raises AssertionError when a bill number prefix, an amendment's body,
    an action's chamber code or a roll call's result is not recognised.
    """
    HTML_TAGS_RE = r'<.*?>'

    # (bill number prefix, bill type, chamber).  Order matters: the short
    # 'H.' and 'S.' prefixes would also match the longer ones above them.
    # A chamber of None means it has to be read from the 'Body' field.
    BILL_KINDS = (
        ('J.R.H.', 'joint resolution', 'lower'),
        ('J.R.S.', 'joint resolution', 'upper'),
        ('H.C.R.', 'concurrent resolution', 'lower'),
        ('S.C.R.', 'concurrent resolution', 'upper'),
        ('H.R.', 'resolution', 'lower'),
        ('S.R.', 'resolution', 'upper'),
        ('PR.', 'constitutional amendment', None),
        ('H.', 'bill', 'lower'),
        ('S.', 'bill', 'upper'),
    )

    year_slug = session[5:]

    # Load all bills and resolutions via the private API
    bills_url = \
        'http://legislature.vermont.gov/bill/loadBillsReleased/{}/'.\
        format(year_slug)
    bills_json = self.get(bills_url).text
    bills = json.loads(bills_json)['data'] or []

    bills_url = \
        'http://legislature.vermont.gov/bill/loadBillsIntroduced/{}/'.\
        format(year_slug)
    bills_json = self.get(bills_url).text
    bills.extend(json.loads(bills_json)['data'] or [])

    resolutions_url = \
        'http://legislature.vermont.gov/bill/loadAllResolutionsByChamber/{}/both'.\
        format(year_slug)
    resolutions_json = self.get(resolutions_url).text
    bills.extend(json.loads(resolutions_json)['data'] or [])

    # Parse the information from each bill
    for info in bills:
        # Strip whitespace from strings
        info = {k: v.strip() for k, v in info.items()}

        # Identify the bill type and chamber
        for prefix, bill_type, bill_chamber in BILL_KINDS:
            if info['BillNumber'].startswith(prefix):
                break
        else:
            raise AssertionError("Unknown bill type found: '{}'".format(
                info['BillNumber']))

        if bill_chamber is None:
            if info['Body'] == 'H':
                bill_chamber = 'lower'
            elif info['Body'] == 'S':
                bill_chamber = 'upper'
            else:
                raise AssertionError("Amendment not tied to chamber")

        # Create the bill using its basic information
        bill = Bill(session=session,
                    bill_id=info['BillNumber'],
                    title=info['Title'],
                    chamber=bill_chamber,
                    type=bill_type)
        if 'resolution' in bill_type:
            bill.add_source(resolutions_url)
        else:
            bill.add_source(bills_url)

        # Load the bill's information page to access its metadata
        bill_url = \
            'http://legislature.vermont.gov/bill/status/{0}/{1}'.\
            format(year_slug, info['BillNumber'])
        doc = self.lxmlize(bill_url)
        bill.add_source(bill_url)

        # Capture sponsors
        sponsors = doc.xpath(
            '//dl[@class="summary-table"]/dt[text()="Sponsor(s)"]/'
            'following-sibling::dd[1]/ul/li')
        sponsor_type = 'primary'
        for sponsor in sponsors:
            if sponsor.xpath('span/text()') == ['Additional Sponsors']:
                sponsor_type = 'cosponsor'
                continue

            sponsor_name = sponsor.xpath('a/text()')[0].\
                replace("Rep.", "").replace("Sen.", "").strip()
            # Skip the five-character "Less" + one trailing character
            # entry.  The previous test compared a five-character slice
            # with the four-character "Less", so it could never match and
            # that entry was added as a sponsor.
            is_less_entry = (sponsor_name[:4] == "Less" and
                             len(sponsor_name) == 5)
            if sponsor_name and not is_less_entry:
                bill.add_sponsor(sponsor_type, sponsor_name)

        # Capture bill text versions
        versions = doc.xpath(
            '//dl[@class="summary-table"]/dt[text()="Bill/Resolution Text"]/'
            'following-sibling::dd[1]/ul/li/a')
        for version in versions:
            bill.add_version(
                name=version.xpath('text()')[0],
                url=version.xpath('@href')[0].replace(' ', '%20'),
                mimetype='application/pdf')

        # Identify the internal bill ID, used for actions and votes
        # If there is no internal bill ID, then it has no extra information
        try:
            internal_bill_id = re.search(
                r'"bill/loadBillDetailedStatus/{}/(\d+)"'.format(
                    year_slug),
                lxml.etree.tostring(doc)).group(1)
        except AttributeError:
            self.warning("Bill {} appears to have no activity".
                         format(info['BillNumber']))
            self.save_bill(bill)
            continue

        # Capture actions
        actions_url = 'http://legislature.vermont.gov/bill/loadBillDetailedStatus/{0}/{1}'.\
            format(year_slug, internal_bill_id)
        actions_json = self.get(actions_url).text
        actions = json.loads(actions_json)['data']
        bill.add_source(actions_url)

        chambers_passed = set()
        for action in actions:
            action = {k: v.strip() for k, v in action.items()}

            if "Signed by Governor" in action['FullStatus']:
                actor = 'governor'
            elif action['ChamberCode'] == 'H':
                actor = 'lower'
            elif action['ChamberCode'] == 'S':
                actor = 'upper'
            else:
                raise AssertionError("Unknown actor for bill action")

            # Categorize action
            if "Signed by Governor" in action['FullStatus']:
                assert chambers_passed == set("HS")
                action_type = 'governor:signed'
            elif actor == 'lower' and \
                    any(x.lower().startswith('aspassed')
                        for x in action['keywords'].split(';')):
                action_type = 'bill:passed'
                chambers_passed.add("H")
            elif actor == 'upper' and \
                    any(x.lower().startswith(' aspassed') or
                        x.lower().startswith('aspassed')
                        for x in action['keywords'].split(';')):
                action_type = 'bill:passed'
                chambers_passed.add("S")
            else:
                action_type = 'other'

            bill.add_action(
                actor=actor,
                action=re.sub(HTML_TAGS_RE, "", action['FullStatus']),
                date=datetime.datetime.strptime(
                    action['StatusDate'], '%m/%d/%Y'),
                type=action_type)

        # Capture votes
        votes_url = 'http://legislature.vermont.gov/bill/loadBillRollCalls/{0}/{1}'.\
            format(year_slug, internal_bill_id)
        votes_json = self.get(votes_url).text
        votes = json.loads(votes_json)['data']
        bill.add_source(votes_url)

        for vote in votes:
            roll_call_id = vote['VoteHeaderID']
            roll_call_url = 'http://legislature.vermont.gov/bill/loadBillRollCallDetails/{0}/{1}'.\
                format(year_slug, roll_call_id)
            roll_call_json = self.get(roll_call_url).text
            roll_call = json.loads(roll_call_json)['data']

            roll_call_yea = []
            roll_call_nay = []
            roll_call_other = []
            for member in roll_call:
                (member_name, _district) = \
                    member['MemberName'].split(" of ")
                member_name = member_name.strip()

                if member['MemberVote'] == "Yea":
                    roll_call_yea.append(member_name)
                elif member['MemberVote'] == "Nay":
                    roll_call_nay.append(member_name)
                else:
                    roll_call_other.append(member_name)

            if "Passed -- " in vote['FullStatus']:
                did_pass = True
            elif "Failed -- " in vote['FullStatus']:
                did_pass = False
            else:
                raise AssertionError("Roll call vote result is unclear")

            # Check vote counts
            yea_count = \
                int(re.search(r'Yeas = (\d+)', vote['FullStatus']).group(1))
            nay_count = \
                int(re.search(r'Nays = (\d+)', vote['FullStatus']).group(1))

            vote_to_add = Vote(
                chamber=('lower' if vote['ChamberCode'] == 'H'
                         else 'upper'),
                date=datetime.datetime.strptime(
                    vote['StatusDate'], '%m/%d/%Y'),
                motion=re.sub(
                    HTML_TAGS_RE, "", vote['FullStatus']).strip(),
                passed=did_pass,
                yes_count=yea_count,
                no_count=nay_count,
                other_count=len(roll_call_other))
            vote_to_add.add_source(roll_call_url)

            for member in roll_call_yea:
                vote_to_add.yes(member)
            for member in roll_call_nay:
                vote_to_add.no(member)
            for member in roll_call_other:
                vote_to_add.other(member)

            # A count mismatch is logged but the vote is still recorded
            try:
                vote_to_add.validate()
            except ValueError as e:
                self.warning(e)

            bill.add_vote(vote_to_add)

        # Capture extra information
        # This is not in the OpenStates spec, but is available
        # Not yet implemented
        # Witnesses: http://legislature.vermont.gov/bill/loadBillWitnessList/{year_slug}/{internal_bill_id}
        # Conference committee members: http://legislature.vermont.gov/bill/loadBillConference/{year_slug}/{bill_number}
        # Committee meetings: http://legislature.vermont.gov/committee/loadHistoryByBill/{year_slug}?LegislationId={internal_bill_id}

        self.save_bill(bill)
def scrape_vote(self, bill, action_text, url):
    """Build a floor vote from ``action_text`` and the page at ``url``.

    ``action_text`` might look like
    "Vote - Senate Floor - Third Reading Passed (46-0) - 01/16/12"; it
    supplies the chamber, the motion, the outcome and, when present, the
    yes/no totals.  The vote page supplies the date, the full tallies and
    the member names.  The finished vote is added to ``bill``.

    Raises Exception when the chamber prefix or the motion outcome is not
    recognised.
    """
    doc = lxml.html.fromstring(self.get(url).text)

    # process action_text - might look like "Vote - Senate Floor - Third Reading Passed (46-0) - 01/16/12"
    if action_text.startswith('Vote - Senate Floor - '):
        action_text = action_text[22:]
        chamber = 'upper'
    elif action_text.startswith('Vote - House Floor - '):
        action_text = action_text[21:]
        chamber = 'lower'
    else:
        # Fail with a clear message rather than an unbound ``chamber``
        raise Exception('unknown vote chamber: %s' % action_text)

    motion, unused_date = action_text.rsplit(' - ', 1)
    try:
        yes_count, no_count = re.findall(r'\((\d+)-(\d+)\)', motion)[0]
        yes_count = int(yes_count)
        no_count = int(no_count)
    except IndexError:
        self.info(
            "Motion text didn't contain vote totals, will get them from elsewhere"
        )
        yes_count = None
        no_count = None

    if 'Passed' in motion:
        motion = motion.split(' Passed')[0]
        passed = True
    elif 'Adopted' in motion:
        motion = motion.split(' Adopted')[0]
        passed = True
    elif 'Rejected' in motion:
        motion = motion.split(' Rejected')[0]
        passed = False
    elif 'Failed' in motion:
        motion = motion.split(' Failed')[0]
        passed = False
    elif 'Concur' in motion:
        passed = True
    elif 'Floor Amendment' in motion:
        # Compare against None so that a tally of 0 (e.g. "46-0") still
        # counts as known totals
        if yes_count is not None and no_count is not None:
            passed = yes_count > no_count
        else:
            passed = None
    elif 'overridden' in motion:
        passed = True
        motion = 'Veto Override'
    else:
        raise Exception('unknown motion: %s' % motion)

    vote = Vote(chamber=chamber, date=None, motion=motion,
                yes_count=yes_count, no_count=no_count,
                other_count=None, passed=passed)
    vfunc = None

    tally_re = re.compile(
        r'(\d+) Yeas\s+(\d+) Nays\s+(\d+) Not Voting\s+(\d+) Excused \(Absent\)\s+(\d+) Absent')

    nobrs = doc.xpath('//nobr/text()')
    for text in nobrs:
        text = text.replace(u'\xa0', ' ')
        if text.startswith('Calendar Date: '):
            if vote['date']:
                self.warning('two dates!, skipping rest of bill')
                break
            stamp = text.split(': ', 1)[1]
            try:
                # %p only adjusts the hour when paired with %I; with %H
                # the AM/PM marker was parsed but ignored
                vote['date'] = datetime.datetime.strptime(
                    stamp, '%b %d, %Y %I:%M %p')
            except ValueError:
                # Keep accepting hours that only the 24-hour form parses
                vote['date'] = datetime.datetime.strptime(
                    stamp, '%b %d, %Y %H:%M %p')
        elif 'Yeas' in text and 'Nays' in text and 'Not Voting' in text:
            yeas, nays, nv, exc, absent = tally_re.match(text).groups()
            vote['yes_count'] = int(yeas)
            vote['no_count'] = int(nays)
            vote['other_count'] = int(nv) + int(exc) + int(absent)
        elif 'Voting Yea' in text:
            vfunc = vote.yes
        elif 'Voting Nay' in text:
            vfunc = vote.no
        elif 'Not Voting' in text or 'Excused' in text:
            vfunc = vote.other
        elif vfunc:
            # str.split returns [text] unchanged when ' and ' is absent
            for leg in text.split(' and '):
                # Strip the occasional asterisk - see #1512
                vfunc(leg.rstrip('*'))

    vote.validate()
    vote.add_source(url)
    bill.add_vote(vote)
def scrape_votes(self, bill, url):
    """Parse every roll call on the page at ``url`` and add it to ``bill``.

    One roll call starts at each paragraph matching the "OKLAHOMA HOUSE" /
    "OKLAHOMA STATE SENATE" header.  Tally lines and names are only acted
    on once the YEAS tally has been seen.  A roll call whose tally line
    carries trailing text is logged and skipped.
    """
    html = self.get(url).text.replace(u'\xa0', ' ')
    page = lxml.html.fromstring(html)

    namespaces = {'re': "http://exslt.org/regular-expressions"}
    header_path = "//p[re:test(text(), 'OKLAHOMA\s+(HOUSE|STATE\s+SENATE)')]"
    tally_re = re.compile(
        r'(YEAS|NAYS|EXCUSED|VACANT|CONSTITUTIONAL PRIVILEGE|NOT VOTING|N/V)\s*:\s*(\d+)(.*)')

    for header in page.xpath(header_path, namespaces=namespaces):
        malformed = False

        # Each chamber has the motion name on a different line of the file
        if 'HOUSE' in header.xpath("string()"):
            chamber, motion_offset = 'lower', 8
        else:
            chamber, motion_offset = 'upper', 13

        motion = header.xpath(
            "string(following-sibling::p[%d])" % motion_offset).strip()
        motion = re.sub(r'\s+', ' ', motion)
        assert motion.strip(), "Motion text not found"

        passed = None
        outcome = re.match(r'^(.*) (PASSED|FAILED)$', motion)
        if outcome:
            motion, result = outcome.groups()
            passed = result == 'PASSED'

        rcs_p = header.xpath(
            "following-sibling::p[contains(., 'RCS#')]")[0]
        rcs_text = rcs_p.xpath("string()").replace(u'\xa0', ' ')
        rcs = re.search(r'RCS#\s+(\d+)', rcs_text).group(1)

        date_text = rcs_p.getnext().xpath("string()")
        date_string = re.search(r'\d+/\d+/\d+', date_text).group(0)
        date = datetime.datetime.strptime(date_string, "%m/%d/%Y").date()

        vtype = None
        counts = collections.defaultdict(int)
        names_by_type = collections.defaultdict(list)
        seen_yes = False

        for sib in header.xpath("following-sibling::p")[13:]:
            line = sib.xpath("string()").replace('\r\n', ' ').strip()
            if "*****" in line:
                break

            tally = tally_re.match(line)
            if tally is None:
                # Name lines are ignored until the YEAS tally has appeared
                if seen_yes:
                    for name in line.split(' '):
                        if not name:
                            continue
                        if 'HOUSE' in name or 'SENATE ' in name:
                            continue
                        names_by_type[vtype].append(name.strip())
                continue

            label, number, trailing = tally.groups()
            if label == 'VACANT':
                continue  # skip these

            if label == 'YEAS' and 'RCS#' not in line:
                vtype = 'yes'
                seen_yes = True
            elif seen_yes:
                vtype = 'no' if label == 'NAYS' else 'other'

            if seen_yes and trailing.strip():
                self.logger.warning("Bad vote format, skipping.")
                malformed = True

            counts[vtype] += int(number)

        if malformed:
            continue

        if passed is None:
            passed = counts['yes'] > (counts['no'] + counts['other'])

        vote = Vote(chamber, date, motion, passed,
                    counts['yes'], counts['no'], counts['other'],
                    rcs_num=rcs)
        vote.validate()
        vote.add_source(url)

        for name in names_by_type['yes']:
            vote.yes(name)
        for name in names_by_type['no']:
            # A colon means a tally line was collected as a name
            if ':' in name:
                raise Exception(name)
            vote.no(name)
        for name in names_by_type['other']:
            vote.other(name)

        vote.validate()
        bill.add_vote(vote)
def scrape_floor_vote(self, chamber, bill, date, url):
    """Parse a floor vote PDF at ``url`` and add it to ``bill``.

    The PDF is downloaded and converted to text.  The motion, the totals
    line and the first row of member votes are located by line index;
    those indices shift when the motion sits one line higher or spans
    extra lines.

    Raises AssertionError when no motion text can be found.  Raises
    whatever ``vote.validate()`` raises when the names still do not match
    the totals after the fallback pass.
    """
    (path, _resp) = self.urlretrieve(url)
    try:
        text = convert_pdf(path, 'text')
    finally:
        # Remove the downloaded file even if the conversion fails
        os.remove(path)
    lines = text.split("\n")

    MOTION_INDEX = 4
    TOTALS_INDEX = 6
    VOTE_START_INDEX = 9

    motion = lines[MOTION_INDEX].strip()
    # Sometimes there is no motion name, only "Passage" in the line above
    if (not motion and
            not lines[MOTION_INDEX - 1].startswith("Calendar Page:")):
        motion = lines[MOTION_INDEX - 1]
        MOTION_INDEX -= 1
        TOTALS_INDEX -= 1
        VOTE_START_INDEX -= 1
    else:
        assert motion, "Floor vote's motion name appears to be empty"

        # Up to two further non-blank lines are appended to the motion;
        # each one pushes the totals and the vote rows down by a line
        for _extra_motion_line in range(2):
            MOTION_INDEX += 1
            if lines[MOTION_INDEX].strip():
                motion = "{}, {}".format(
                    motion, lines[MOTION_INDEX].strip())
                TOTALS_INDEX += 1
                VOTE_START_INDEX += 1
            else:
                break

    (yes_count, no_count, other_count) = [int(x) for x in re.search(
        r'^\s+Yeas - (\d+)\s+Nays - (\d+)\s+Not Voting - (\d+)\s*$',
        lines[TOTALS_INDEX]).groups()]
    passed = (yes_count > no_count)

    vote = Vote(chamber, date, motion, passed,
                yes_count, no_count, other_count)
    vote.add_source(url)

    # Votes follow the pattern of:
    # [vote code] [member name]-[district number]
    yes_re = re.compile(r'\s*Y\s+(.*?)-\d{1,3}\s*')
    no_re = re.compile(r'\s*N\s+(.*?)-\d{1,3}\s*')
    other_re = re.compile(r'\s*(?:EX|AV)\s+(.*?)-\d{1,3}\s*')

    for line in lines[VOTE_START_INDEX:]:
        if not line.strip():
            break

        if " President " in line:
            line = line.replace(" President ", " ")
        elif " Speaker " in line:
            line = line.replace(" Speaker ", " ")

        for member in yes_re.findall(line):
            vote.yes(member)
        for member in no_re.findall(line):
            vote.no(member)
        for member in other_re.findall(line):
            vote.other(member)

    try:
        vote.validate()
    except ValueError:
        # On a rare occasion, a member won't have a vote code,
        # which indicates that they didn't vote. The totals reflect
        # this.
        self.logger.info("Votes don't add up; looking for additional ones")
        uncoded_re = re.compile(r'\s{8,}([A-Z][a-z\'].*?)-\d{1,3}')
        for line in lines[VOTE_START_INDEX:]:
            if not line.strip():
                break
            for member in uncoded_re.findall(line):
                vote.other(member)

    vote.validate()
    bill.add_vote(vote)
def scrape_lower_committee_votes(self, session_number, bill):
    '''
    Fetch all House committee votes for ``bill`` and add them to it.

    House committee roll calls are not available on the Senate's website,
    and the House uses an internal ID system in its URLs, so the bill is
    first looked up through the House search form and its "See Votes"
    links are then followed one by one.
    '''
    house_url = 'http://www.myfloridahouse.gov/Sections/Bills/bills.aspx'

    # Only the digit characters of the bill ID are sent to the search form
    bill_number = ''.join(
        char for char in bill['bill_id'] if char.isdigit())

    form = dict([
        ('rblChamber', 'B'),
        ('ddlSession', session_number),
        ('ddlBillList', '-1'),
        ('txtBillNumber', bill_number),
        ('ddlSponsor', '-1'),
        ('ddlReferredTo', '-1'),
        ('SubmittedByControl', ''),
    ])
    search_page = self.post(url=house_url, data=form).text
    doc = lxml.html.fromstring(search_page)
    doc.make_links_absolute(house_url)

    # Unpacking enforces that exactly one bill detail link is present
    (bill_link, ) = doc.xpath(
        '//a[contains(@href, "/Bills/billsdetail.aspx?BillId=")]/@href')
    bill_doc = self.lxmlize(bill_link)
    vote_links = bill_doc.xpath('//a[text()="See Votes"]/@href')

    for link in vote_links:
        vote_doc = self.lxmlize(link)

        (date_text, ) = vote_doc.xpath(
            '//span[@id="ctl00_ContentPlaceHolder1_lblDate"]/text()')
        date = datetime.datetime.strptime(
            date_text, '%m/%d/%Y %I:%M:%S %p').date()

        # The totals are taken from the last nested table on the page
        totals_cell = vote_doc.xpath('//table//table')[-1]
        totals = re.sub(
            r'(?mu)\s+', " ", totals_cell.text_content()).strip()
        tally = re.search(
            r'(?m)Total Yeas:\s+(\d+)\s+Total Nays:\s+(\d+)\s+'
            'Total Missed:\s+(\d+)', totals).groups()
        yes_count = int(tally[0])
        no_count = int(tally[1])
        other_count = int(tally[2])
        passed = yes_count > no_count

        (committee, ) = vote_doc.xpath(
            '//span[@id="ctl00_ContentPlaceHolder1_lblCommittee"]/text()')
        (action, ) = vote_doc.xpath(
            '//span[@id="ctl00_ContentPlaceHolder1_lblAction"]/text()')
        motion = "{} ({})".format(action, committee)

        vote = Vote('lower', date, motion, passed,
                    yes_count, no_count, other_count)
        vote.add_source(link)

        for cell in vote_doc.xpath('//table//table//table//td'):
            if cell.text_content().strip():
                (member, ) = cell.xpath('span[2]//text()')
                (mark, ) = cell.xpath('span[1]//text()')

                if mark == "Y":
                    vote.yes(member)
                elif mark == "N":
                    vote.no(member)
                elif mark == "-":
                    vote.other(member)
                # Parenthetical votes appear to not be counted in the
                # totals for Yea, Nay, _or_ Missed
                elif not re.search(r'\([YN]\)', mark):
                    raise IndexError(
                        "Unknown vote type found: {}".format(mark))

        vote.validate()
        bill.add_vote(vote)
def scrape_votes(self, bill, url):
    """Parse every roll call on the page at ``url`` and add it to ``bill``.

    One roll call starts at each paragraph matching the "OKLAHOMA HOUSE" /
    "OKLAHOMA STATE SENATE" header.  Tally lines and names are only acted
    on once the YEAS tally has been seen.  An empty motion is replaced by
    a generic "Senate Vote" / "House Vote" label.
    """
    raw_html = self.urlopen(url)
    page = lxml.html.fromstring(raw_html.replace(u"\xa0", " "))

    ns_map = {"re": "http://exslt.org/regular-expressions"}
    header_query = "//p[re:test(text(), 'OKLAHOMA\s+(HOUSE|STATE\s+SENATE)')]"
    tally_pattern = (
        r"(YEAS|NAYS|EXCUSED|VACANT|CONSTITUTIONAL PRIVILEGE|NOT VOTING|N/V)\s*:\s*(\d+)"
    )

    for header in page.xpath(header_query, namespaces=ns_map):
        is_house = "HOUSE" in header.xpath("string()")
        chamber = "lower" if is_house else "upper"
        # The motion paragraph sits at a different offset for each chamber
        motion_position = 8 if is_house else 9

        motion_query = "string(following-sibling::p[%d])" % motion_position
        motion = re.sub(r"\s+", " ", header.xpath(motion_query).strip())

        verdict = re.match(r"^(.*) (PASSED|FAILED)$", motion)
        if verdict is None:
            passed = None
        else:
            motion = verdict.group(1)
            passed = verdict.group(2) == "PASSED"

        rcs_p = header.xpath("following-sibling::p[contains(., 'RCS#')]")[0]
        rcs = re.search(
            r"RCS#\s+(\d+)", rcs_p.xpath("string()").replace(u"\xa0", " ")
        ).group(1)

        # The date is read from the element right after the RCS paragraph
        found_date = re.search(r"\d+/\d+/\d+", rcs_p.getnext().xpath("string()"))
        date = datetime.datetime.strptime(found_date.group(0), "%m/%d/%Y").date()

        vtype = None
        seen_yes = False
        counts = collections.defaultdict(int)
        roster = collections.defaultdict(list)

        for paragraph in header.xpath("following-sibling::p")[13:]:
            line = paragraph.xpath("string()").replace("\r\n", " ").strip()
            if "*****" in line:
                break

            tally = re.match(tally_pattern, line)
            if tally:
                label = tally.group(1)
                if label == "VACANT":
                    continue  # skip these
                if label == "YEAS" and "RCS#" not in line:
                    vtype = "yes"
                    seen_yes = True
                elif seen_yes:
                    vtype = "no" if label == "NAYS" else "other"
                counts[vtype] += int(tally.group(2))
            elif seen_yes:
                # Name lines are ignored until the YEAS tally has appeared
                for token in line.split(" "):
                    if not token:
                        continue
                    if "HOUSE BILL" in token or "SENATE BILL" in token:
                        continue
                    roster[vtype].append(token.strip())

        if passed is None:
            passed = counts["yes"] > (counts["no"] + counts["other"])

        if not motion:
            motion = "Senate Vote" if chamber == "upper" else "House Vote"

        vote = Vote(
            chamber, date, motion, passed,
            counts["yes"], counts["no"], counts["other"],
            rcs_num=rcs,
        )
        vote.validate()
        vote.add_source(url)

        for voter in roster["yes"]:
            vote.yes(voter)
        for voter in roster["no"]:
            # A colon means a tally line was collected as a name
            if ":" in voter:
                raise Exception(voter)
            vote.no(voter)
        for voter in roster["other"]:
            vote.other(voter)

        vote.validate()
        bill.add_vote(vote)
def scrape_vote(self, bill, action_text, url):
    """Build a floor vote from ``action_text`` and the page at ``url``.

    ``action_text`` might look like
    "Vote - Senate Floor - Third Reading Passed (46-0) - 01/16/12"; it
    supplies the chamber, the motion, the outcome and, when present, the
    yes/no totals.  The vote page supplies the date, the full tallies and
    the member names.  The finished vote is added to ``bill``.

    Raises Exception when the chamber prefix or the motion outcome is not
    recognised.
    """
    doc = lxml.html.fromstring(self.get(url).text)

    # process action_text - might look like "Vote - Senate Floor - Third Reading Passed (46-0) - 01/16/12"
    if action_text.startswith('Vote - Senate Floor - '):
        action_text = action_text[22:]
        chamber = 'upper'
    elif action_text.startswith('Vote - House Floor - '):
        action_text = action_text[21:]
        chamber = 'lower'
    else:
        # Fail with a clear message rather than an unbound ``chamber``
        raise Exception('unknown vote chamber: %s' % action_text)

    motion, unused_date = action_text.rsplit(' - ', 1)
    try:
        yes_count, no_count = re.findall(r'\((\d+)-(\d+)\)', motion)[0]
        yes_count = int(yes_count)
        no_count = int(no_count)
    except IndexError:
        self.info("Motion text didn't contain vote totals, will get them from elsewhere")
        yes_count = None
        no_count = None

    if 'Passed' in motion:
        motion = motion.split(' Passed')[0]
        passed = True
    elif 'Adopted' in motion:
        motion = motion.split(' Adopted')[0]
        passed = True
    elif 'Rejected' in motion:
        motion = motion.split(' Rejected')[0]
        passed = False
    elif 'Failed' in motion:
        motion = motion.split(' Failed')[0]
        passed = False
    elif 'Concur' in motion:
        passed = True
    elif 'Floor Amendment' in motion:
        # Compare against None so that a tally of 0 (e.g. "46-0") still
        # counts as known totals
        if yes_count is not None and no_count is not None:
            passed = yes_count > no_count
        else:
            passed = None
    elif 'overridden' in motion:
        passed = True
        motion = 'Veto Override'
    else:
        raise Exception('unknown motion: %s' % motion)

    vote = Vote(chamber=chamber, date=None, motion=motion,
                yes_count=yes_count, no_count=no_count,
                other_count=None, passed=passed)
    vfunc = None

    tally_re = re.compile(
        r'(\d+) Yeas\s+(\d+) Nays\s+(\d+) Not Voting\s+(\d+) Excused \(Absent\)\s+(\d+) Absent')

    nobrs = doc.xpath('//nobr/text()')
    for text in nobrs:
        text = text.replace(u'\xa0', ' ')
        if text.startswith('Calendar Date: '):
            if vote['date']:
                self.warning('two dates!, skipping rest of bill')
                break
            stamp = text.split(': ', 1)[1]
            try:
                # %p only adjusts the hour when paired with %I; with %H
                # the AM/PM marker was parsed but ignored
                vote['date'] = datetime.datetime.strptime(
                    stamp, '%b %d, %Y %I:%M %p')
            except ValueError:
                # Keep accepting hours that only the 24-hour form parses
                vote['date'] = datetime.datetime.strptime(
                    stamp, '%b %d, %Y %H:%M %p')
        elif 'Yeas' in text and 'Nays' in text and 'Not Voting' in text:
            yeas, nays, nv, exc, absent = tally_re.match(text).groups()
            vote['yes_count'] = int(yeas)
            vote['no_count'] = int(nays)
            vote['other_count'] = int(nv) + int(exc) + int(absent)
        elif 'Voting Yea' in text:
            vfunc = vote.yes
        elif 'Voting Nay' in text:
            vfunc = vote.no
        elif 'Not Voting' in text or 'Excused' in text:
            vfunc = vote.other
        elif vfunc:
            # Iterating over the split handles any number of ' and '
            # separators; unpacking into exactly two names raised
            # ValueError when a line held more than one
            for leg in text.split(' and '):
                # Drop any trailing asterisks from the name
                vfunc(leg.rstrip('*'))

    vote.validate()
    vote.add_source(url)
    bill.add_vote(vote)