def votes(root):
    """Yield a ``Vote`` for every "was adopted by (Record N)" entry in *root*.

    *root* is an element tree queried with XPath.  Every ``<p>`` whose text
    starts with "Yeas \u2014" is treated as the start of a roll call; the
    paragraph immediately before it is expected to hold the summary sentence
    (bill id, record number and the yea/nay/present counts).

    The following sibling paragraphs are consumed in order: an optional
    "Nays" paragraph, then any number of "Present"/"Absent" paragraphs.
    Voter names are extracted from each paragraph with ``names()``.

    Paragraphs with no preceding sibling, or whose summary sentence does not
    match, are skipped.
    """
    summary_re = re.compile(
        r'(\w+ \d+) was adopted by \(Record (\d+)\): '
        r'(\d+) Yeas, (\d+) Nays, (\d+) Present')

    for yeas_el in root.xpath(u'//p[starts-with(., "Yeas \u2014")]'):
        summary_el = yeas_el.getprevious()
        if summary_el is None:
            # No paragraph before the roll call, so nothing to parse.
            continue

        m = summary_re.search(''.join(summary_el.itertext()))
        if m is None:
            continue

        yes_count = int(m.group(3))
        no_count = int(m.group(4))
        other_count = int(m.group(5))

        vote = Vote(None, None, 'final passage', True,
                    yes_count, no_count, other_count)
        vote['bill_id'] = m.group(1)
        vote['session'] = '81'
        vote['record'] = m.group(2)
        vote['filename'] = m.group(2)

        for name in names(yeas_el):
            vote.yes(name)

        # getnext() returns None at the end of the sibling chain, so every
        # dereference below is guarded.
        el = yeas_el.getnext()
        if el is not None and el.text and el.text.startswith('Nays'):
            for name in names(el):
                vote.no(name)
            el = el.getnext()

        while (el is not None and el.text
               and re.match(r'Present|Absent', el.text)):
            for name in names(el):
                vote.other(name)
            el = el.getnext()

        # The summary only reports "Present"; recount from the names
        # actually collected (Present and Absent paragraphs).
        vote['other_count'] = len(vote['other_votes'])
        yield vote
def apply_votes(self, bill):
    """Given a bill (and assuming it has a status_url in its dict), parse
    all of the votes and attach them to the bill.

    Each entry returned by ``votes.all_votes_for_url`` is unpacked as
    ``(chamber, vote_desc, pdf_url, these_votes)``.  ``these_votes`` is
    iterated as a voter -> code mapping in which 'Y' is a yes, 'N' is a no
    and anything else is counted as "other".

    The vote date is the text after the last "-" in ``vote_desc``.
    Descriptions without a "-" are reported through ``self.warning`` and
    skipped.
    """
    bill_votes = votes.all_votes_for_url(self, bill['status_url'])
    for (chamber, vote_desc, pdf_url, these_votes) in bill_votes:
        # str.split() never returns an empty list, so indexing [-1] cannot
        # raise IndexError; test for the separator explicitly instead.
        _, dash, date = vote_desc.rpartition("-")
        if not dash:
            self.warning("[%s] Couldn't get date out of [%s]" % (bill['bill_id'],vote_desc))
            continue

        yes_votes = []
        no_votes = []
        other_votes = []
        for voter, cast in these_votes.iteritems():
            if cast == 'Y':
                yes_votes.append(voter)
            elif cast == 'N':
                no_votes.append(voter)
            else:
                other_votes.append(voter)

        # not necessarily correct, but not sure where else to get it.
        # maybe from pdf
        passed = len(yes_votes) > len(no_votes)
        vote = Vote(standardize_chamber(chamber), date, vote_desc, passed,
                    len(yes_votes), len(no_votes), len(other_votes),
                    pdf_url=pdf_url)
        for voter in yes_votes:
            vote.yes(voter)
        for voter in no_votes:
            vote.no(voter)
        for voter in other_votes:
            vote.other(voter)
        bill.add_vote(vote)
def parse_vote_new(self, bill, chamber, url):
    """Fetch the vote page at *url* and add the parsed vote to *bill*.

    The second row of the page's first table supplies, in column order:
    date (``%m/%d/%Y``), motion, yes count, no count, absent count and the
    result (the vote passed when that cell reads ``'Pass'``).  Rows from the
    fourth onward that have exactly two cells are individual votes: the
    first cell is the voter (text before ``' of'``), the second the vote.
    """
    soup = BeautifulSoup(self.urlopen(url))
    vote_table = soup.table
    rows = vote_table.findAll('tr')

    summary = rows[1]
    when = dt.datetime.strptime(summary.td.contents[0], '%m/%d/%Y')
    cells = summary.findAll('td')
    motion = cells[1].contents[0]
    yes_count = int(cells[2].contents[0])
    no_count = int(cells[3].contents[0])
    abs_count = int(cells[4].contents[0])
    passed = cells[5].contents[0] == 'Pass'

    vote = Vote(chamber, when, motion, passed,
                yes_count, no_count, abs_count)
    vote.add_source(url)

    for row in rows[3:]:
        pair = row.findAll('td')
        if len(pair) != 2:
            # Not a "voter / vote" row.
            continue

        voter = pair[0].contents[0].split(' of')[0]
        cast = pair[1].contents[0]
        if cast.startswith('Yea'):
            vote.yes(voter)
        elif cast.startswith('Nay'):
            vote.no(voter)
        else:
            vote.other(voter)

    bill.add_vote(vote)
def record_votes(root):
    """Yield a ``Vote`` for each record vote found in the tree *root*.

    Every ``<p>`` whose text starts with "Yeas \u2014" starts a roll call.
    The paragraph before it must contain a sentence of the form
    "BILL [as amended] was adopted|passed [to engrossment|third reading]
    by (Record N): X Yeas, Y Nays, Z Present".

    Bills whose id starts with ``H``/``CSHB`` are tagged 'lower', those
    starting with ``S``/``CSSB`` 'upper'; anything else is skipped.  The
    motion text comes from ``get_type(match)``.  Following siblings are read
    as an optional "Nays" paragraph and then "Present"/"Absent" paragraphs,
    with voter names extracted by ``names()``.
    """
    summary_re = re.compile(
        r'(?P<bill_id>\w+\W+\d+)(,?\W+as\W+amended,?)?\W+was\W+'
        r'(?P<type>adopted|passed'
        r'(\W+to\W+(?P<to>engrossment|third\W+reading))?)\W+'
        r'by\W+\(Record\W+(?P<record>\d+)\):\W+'
        r'(?P<yeas>\d+)\W+Yeas,\W+(?P<nays>\d+)\W+Nays,\W+'
        r'(?P<present>\d+)\W+Present')

    for yeas_el in root.xpath(u'//p[starts-with(., "Yeas \u2014")]'):
        summary_el = yeas_el.getprevious()
        if summary_el is None:
            continue

        # str.replace returns a new string; the result must be kept or a
        # line break inside the bill id ends up in the captured value.
        text = ''.join(summary_el.itertext()).replace('\n', ' ')

        m = summary_re.search(text)
        if m is None:
            continue

        bill_id = m.group('bill_id')
        if bill_id.startswith(('H', 'CSHB')):
            bill_chamber = 'lower'
        elif bill_id.startswith(('S', 'CSSB')):
            bill_chamber = 'upper'
        else:
            continue

        yes_count = int(m.group('yeas'))
        no_count = int(m.group('nays'))
        other_count = int(m.group('present'))

        motion_type = get_type(m)
        vote = Vote(None, None, motion_type, True,
                    yes_count, no_count, other_count)
        vote['bill_id'] = bill_id
        vote['bill_chamber'] = bill_chamber
        vote['session'] = '81'
        vote['method'] = 'record'
        vote['record'] = m.group('record')
        vote['filename'] = m.group('record')

        for name in names(yeas_el):
            vote.yes(name)

        # getnext() returns None after the last sibling; guard each use.
        el = yeas_el.getnext()
        if el is not None and el.text and el.text.startswith('Nays'):
            for name in names(el):
                vote.no(name)
            el = el.getnext()

        while (el is not None and el.text
               and re.match(r'Present|Absent', el.text)):
            for name in names(el):
                vote.other(name)
            el = el.getnext()

        # Recount from the names collected (Present and Absent paragraphs).
        vote['other_count'] = len(vote['other_votes'])
        yield vote
def scrape_old_vote(self, url):
    """Parse an old-layout vote page at *url* and return a ``Vote``.

    The page's first ``<h3>`` is split on ``', '``: the second piece names
    the chamber (anything not starting with 'House' is treated as upper)
    plus an optional location, and the remaining pieces form the motion.
    Counts come from the 2nd, 4th, 6th and 8th cells of the page's second
    table (yes, no, excused, absent); an empty cell counts as 0.  The
    vote is considered passed when yes outnumbers no.

    Individual votes are read from the first table: a cell reading 'Yea',
    'Nay', 'Excused' or 'Absent' is applied to the content of the element
    preceding it.
    """
    vote_page = self.soup_parser(self.urlopen(url))

    header = vote_page.h3.contents[0]
    header_parts = header.split(', ')
    chamber_name = header_parts[1]
    if chamber_name.startswith('House'):
        chamber = 'lower'
    else:
        chamber = 'upper'

    location = ' '.join(chamber_name.split(' ')[1:])
    if location.startswith('of Representatives'):
        # "House of Representatives" carries no committee/location.
        location = ''

    motion = ', '.join(header_parts[2:])

    def get_count(cell):
        # Empty cells stand for a zero count.
        if not cell.contents:
            return 0
        return int(cell.contents[0])

    result_cells = vote_page.findAll('table')[1].findAll('td')
    yes_count = get_count(result_cells[1])
    no_count = get_count(result_cells[3])
    excused_count = get_count(result_cells[5])
    absent_count = get_count(result_cells[7])
    other_count = excused_count + absent_count

    passed = yes_count > no_count

    vote = Vote(chamber, None, motion, passed,
                yes_count, no_count,
                other_count, excused_count=excused_count,
                absent_count=absent_count,
                location=location)
    vote.add_source(url)

    vote_tbl = vote_page.table
    for td in vote_tbl.findAll('td'):
        if not td.contents:
            # Empty cells carry no vote; indexing contents[0] would raise
            # IndexError on them.
            continue

        marker = td.contents[0]
        if marker == 'Yea':
            vote.yes(td.findPrevious().contents[0])
        elif marker == 'Nay':
            vote.no(td.findPrevious().contents[0])
        elif marker in ['Excused', 'Absent']:
            vote.other(td.findPrevious().contents[0])

    return vote
def scrape_new_vote(self, url):
    """Parse a new-layout vote page at *url* and return a ``Vote``.

    The heading element ``ctl00_contentMain_hdVote`` is split on ``', '``
    into chamber (second piece; non-'House' means upper), optional location
    and motion (remaining pieces).  The aye/nay/excused/absent totals are
    read from their ``ctl00_contentMain_td*`` elements, and the vote passes
    when ayes outnumber nays.  Individual votes come from the cells of
    ``ctl00_contentMain_tblVotes``.
    """
    page = self.soup_parser(self.urlopen(url))

    heading = page.find(id="ctl00_contentMain_hdVote").contents[0]
    pieces = heading.split(', ')
    chamber_name = pieces[1]
    chamber = 'lower' if chamber_name.startswith('House') else 'upper'

    location = ' '.join(chamber_name.split(' ')[1:])
    if location.startswith('of Representatives'):
        location = ''

    motion = ', '.join(pieces[2:])

    def read_count(element_id):
        # Integer held in the first child of the element with this id.
        return int(page.find(id=element_id).contents[0])

    yes_count = read_count("ctl00_contentMain_tdAyes")
    no_count = read_count("ctl00_contentMain_tdNays")
    excused_count = read_count("ctl00_contentMain_tdExcused")
    absent_count = read_count("ctl00_contentMain_tdAbsent")
    other_count = excused_count + absent_count

    passed = yes_count > no_count

    vote = Vote(chamber, None, motion, passed,
                yes_count, no_count,
                other_count, excused_count=excused_count,
                absent_count=absent_count,
                location=location)
    vote.add_source(url)

    roll_call = page.find(id="ctl00_contentMain_tblVotes")
    for cell in roll_call.findAll('td'):
        marker = cell.contents[0]
        if marker == 'Yea':
            vote.yes(cell.findPrevious().contents[0])
        elif marker == 'Nay':
            vote.no(cell.findPrevious().contents[0])
        elif marker in ['Excused', 'Absent']:
            vote.other(cell.findPrevious().contents[0])

    return vote
def parse_votes(self, url, page, chamberName, bill):
    """Follow every link under ``#votes`` on *page* and add a vote to *bill*.

    Each link is resolved against *url* and fetched.  On the linked page
    ``#date``, ``#yea`` and ``#nay`` supply the date and counts, and all
    ``#not-voting`` elements are summed into the "other" count.  The motion
    is left empty and the vote passes when yeas outnumber nays.  Each
    ``ul.roll-call li`` is one voter: its first child holds the vote code
    ("Y", "N", anything else is "other").
    """
    for link in page.cssselect("#votes a"):
        vote_url = urlparse.urljoin(url, link.get("href"))
        roll_call_page = parse(vote_url).getroot()

        date = roll_call_page.cssselect("#date")[0].text
        yeses = int(roll_call_page.cssselect("#yea")[0].text)
        noes = int(roll_call_page.cssselect("#nay")[0].text)
        other = sum(int(node.text)
                    for node in roll_call_page.cssselect("#not-voting"))

        passed = yeses > noes
        vote_obj = Vote(chamberName, date, "", passed, yeses, noes, other)

        for entry in roll_call_page.cssselect("ul.roll-call li"):
            rep = entry.text_content().strip()
            code = entry[0].text
            if code == "Y":
                vote_obj.yes(rep)
            elif code == "N":
                vote_obj.no(rep)
            else:
                vote_obj.other(rep)

        bill.add_vote(vote_obj)
def parse_vote(self, bill, actor, date, text, line):
    """Fetch the vote page linked from *line* and add the vote to *bill*.

    *line* must provide an ``'href'`` item that is appended to the
    leg1.state.va.us host.  The chamber is taken from the first
    "date House" / "date Senate" occurrence in the page text; an
    ``IndexError`` is raised when none is found.  *actor* is accepted but
    not used.

    Every ``<p>`` with a plain string of the form ``LABEL--names--...`` is a
    list of voters: ``YEAS`` are yes votes, ``NAYS`` are no votes, and any
    other label is "other".  Names are separated by ``", "``; a piece that
    looks like two initials ("J.C.") is attached to the surname before it.
    Counts and the passed flag are derived from the collected names.
    """
    url = "http://leg1.state.va.us%s" % line['href']
    chambers = {'Senate': 'upper', 'House': 'lower'}
    with self.soup_context(url) as vote_data:
        page_text = self.unescape(unicode(vote_data))
        # The pattern needs a group with alternation: the previous
        # "[...]" form was a character class matching one character.
        found = re.findall(r'\d+/\d+/\d+\s+(House|Senate)\b', page_text)
        house = chambers[found[0]]

        vote = Vote(house, date, text, None, 0, 0, 0)
        for cast in vote_data.findAll('p'):
            if cast.string is None:
                continue
            cleaned = cast.string.replace('\r\n', ' ')
            split_start = cleaned.find('--')
            voted = cleaned[0:split_start].strip()
            split_end = cleaned.find('--', split_start + 2)
            if split_end == -1:
                # No closing "--": not a voter list.
                continue

            names = []
            pending = ''
            for piece in cleaned[split_start + 2:split_end].split(", "):
                if re.match(r'\w\.\w\.', piece):
                    # Initials belong to the surname seen just before.
                    names.append(pending + ', ' + piece)
                    pending = ''
                else:
                    if pending != '':
                        names.append(pending)
                    pending = piece
            # Flush the final surname; without this the last voter of every
            # list that does not end in initials is lost.
            if pending != '':
                names.append(pending)

            for voter in names:
                voter = voter.strip()
                if voted == 'YEAS':
                    vote.yes(voter)
                elif voted == 'NAYS':
                    vote.no(voter)
                else:
                    vote.other(voter)

        vote['other_count'] = len(vote['other_votes'])
        vote['yes_count'] = len(vote['yes_votes'])
        vote['no_count'] = len(vote['no_votes'])
        vote['passed'] = (vote['yes_count'] > vote['no_count'])
        bill.add_vote(vote)
def parse_vote(self, vote_url, chamberName):
    """Parse the roll-call page at *vote_url* and return a ``Vote``.

    The summary table is the first ``<table>`` whose text starts with
    "Yeas".  Within its ancestor table (``ancestor_table``) the nested
    table right after the summary holds the individual votes and the one
    right before it holds the date and motion.

    Counts are the 3rd, 6th, 9th and 12th whitespace-separated tokens of
    the summary (yes, no, and two "other" figures that are added together).
    Tally entries are either fused ("YSmith") or alternate between a
    one-character code and a name on separate lines.
    """
    page = parse(vote_url).getroot()
    summary_table = [tab for tab in page.cssselect("table")
                     if tab.text_content().startswith("Yeas")][0]
    vote_table = ancestor_table(summary_table)
    ind = table_index(vote_table, summary_table)
    nested_tables = vote_table.cssselect("table")
    vote_tally_table = nested_tables[ind + 1]
    date_table = nested_tables[ind - 1]

    def to_count(token):
        # Drop zero padding only on the left: strip("0") would also eat
        # trailing zeros and turn "10" into 1.  An all-zero token is 0.
        return int(token.lstrip("0") or 0)

    counts_line = summary_table.text_content().strip().split()
    yeses = to_count(counts_line[2])
    noes = to_count(counts_line[5])
    other = to_count(counts_line[8]) + to_count(counts_line[11])

    entries = [piece.strip() for piece in
               vote_tally_table.text_content().strip().split("\n")]
    entries = [piece for piece in entries if piece != ""]
    if entries and len(entries[0]) > 1:
        # Code and name are fused: first character is the vote code.
        tally_counts = [(entry[:1], entry[1:]) for entry in entries]
    else:
        # Code and name alternate on separate lines.
        tally_counts = [(entries[i], entries[i + 1])
                        for i in range(0, len(entries), 2)]

    date_line = date_table.text_content().strip().split("\n")
    date = " ".join(date_line[0:2])
    motion = date_line[-1]

    vote = Vote(chamberName, date, motion, yeses > noes, yeses, noes, other)
    print("tally_counts: %s" % tally_counts)
    for val, rep in tally_counts:
        if val == "Y":
            vote.yes(rep)
        elif val == "N":
            vote.no(rep)
        else:
            vote.other(rep)
    return vote
def scrape_bills(self, chamber, year):
    """Save one hard-coded example bill for *chamber* in 2009.

    Raises ``NoDataForYear`` for any *year* other than the string "2009".
    The bill ("SB 1" for the upper chamber, "HB 1" otherwise) gets a
    source, a version, a document, two sponsors, two votes and three
    actions, and is then passed to ``self.save_bill``.
    """
    if year != "2009":
        # Pass the offending year along so the error identifies it.
        raise NoDataForYear(year)

    if chamber == "upper":
        other_chamber = "lower"
        bill_id = "SB 1"
    else:
        other_chamber = "upper"
        bill_id = "HB 1"

    b1 = Bill("2009-2010", chamber, bill_id, "A super bill")
    b1.add_source("http://example.com")
    b1.add_version("As Introduced", "http://example.com/SB1.html")
    b1.add_document("Google", "http://google.com")
    b1.add_sponsor("primary", "Bob Smith")
    b1.add_sponsor("secondary", "Johnson, Sally")

    d1 = datetime.datetime.strptime("1/29/2010", "%m/%d/%Y")
    v1 = Vote("upper", d1, "Final passage", True, 2, 0, 0)
    v1.yes("Bob Smith")
    v1.yes("Sally Johnson")

    d2 = datetime.datetime.strptime("1/30/2010", "%m/%d/%Y")
    v2 = Vote("lower", d2, "Final passage", False, 0, 1, 1)
    v2.no("B. Smith")
    v2.other("Sally Johnson")

    b1.add_vote(v1)
    b1.add_vote(v2)

    b1.add_action(chamber, "introduced", d1)
    b1.add_action(chamber, "read first time", d1)
    b1.add_action(other_chamber, "introduced", d2)

    self.save_bill(b1)
def scrape_votes(self,url,chamb):
    """Fetch the roll-call page at *url* and return the parsed ``Vote``.

    Two page layouts are handled:

    * "structured" pages (more than four ``<font>``/``<span>`` tags in
      total): summary lines are read from those tags and individual votes
      from the cells of the last ``<table>``, where a code cell (Y, N, X or
      A) is followed by a name cell;
    * "unstructured" pages: everything is read from the last child of the
      ``<pre>`` element.  Summary lines precede the line "The following is
      the roll call vote:"; the remainder is a stream of vote codes and
      names separated by double spaces.

    The vote passes when the yea count reaches the "Necessary for
    Adoption/Passage" figure.  X and A codes are recorded as "other".
    *chamb* is passed through as the vote's chamber.
    """
    soup = BeautifulSoup(urllib2.urlopen(urllib2.Request(url)).read())
    date = None
    motion = None
    yeas = None
    neas = None
    others = None
    necessary = None
    chamber = chamb
    digits = re.compile(r'^[\d ]+$')

    def record(ballot, code, name):
        # File *name* under the bucket selected by the vote code.
        if code == 'Y':
            ballot.yes(name)
        elif code == 'N':
            ballot.no(name)
        else:
            ballot.other(name)

    fonts = soup.findAll('font')
    span = soup.findAll('span')
    if (len(fonts) + len(span)) > 4:
        # data is vaguely structured
        if len(fonts) < 4:
            fonts = span
        for tag in fonts:
            if not tag.contents:
                continue
            line = str(tag.contents[0]).strip()
            if "Taken on" in line:
                # the text is in the form of: "Taken on DATE REASON"
                split = line.split(None, 3)
                date = split[2]
                if len(split) > 3:
                    motion = split[3]
            elif "Those voting Yea" in line:
                yeas = self.get_num_from_line(line)
            elif "Those voting Nay" in line:
                neas = self.get_num_from_line(line)
            elif "Those absent and not voting" in line:
                others = self.get_num_from_line(line)
            elif ("Necessary for Adoption" in line
                  or "Necessary for Passage" in line):
                necessary = self.get_num_from_line(line)

        passed = yeas >= necessary
        vote = Vote(chamber, date, motion, passed, yeas, neas, others)

        # figure out who voted for what; the roll call is the last table
        tables = soup.findAll('table')
        tds = tables[-1].findAll('td')
        vote_value = None
        for cell in tds:
            # either we are looking at fonts or spans
            holder = cell.find('font')
            if holder is None:
                holder = cell.find('span')
            # NOTE: this must not be called "string": assigning that name
            # anywhere in the function hides the string module everywhere
            # in it.
            if holder is not None:
                cell_text = holder.contents[0].strip()
            else:
                cell_text = ''

            if len(cell_text) > 0 and digits.search(cell_text) is None:
                if vote_value is None:
                    if cell_text in ('Y', 'N'):
                        vote_value = cell_text
                    elif cell_text in ('X', 'A'):
                        vote_value = 'X'
                else:
                    record(vote, vote_value, cell_text)
                    vote_value = None
    else:
        # data is mostly unstructured. Have to sift through a string
        data = soup.find('pre')
        raw = data.contents[-1].strip()
        lines = re.split(r'\n+|\r+|\f+', raw)

        last_index = -1
        for last_index, raw_line in enumerate(lines):
            line = raw_line.strip()
            if "Taken on" in line:
                # the text is in the form of: "Taken on DATE REASON"
                split = line.split(None, 3)
                date = split[2]
                if len(split) > 3:
                    motion = split[3]
            elif "Those voting Yea" in line:
                yeas = self.get_num_from_line(line)
            elif "Those voting Nay" in line:
                neas = self.get_num_from_line(line)
            elif "Those absent and not voting" in line:
                others = self.get_num_from_line(line)
            elif ("Necessary for Adoption" in line
                  or "Necessary for Passage" in line):
                if "Adoption" in line:
                    motion = "Adoption"
                else:
                    motion = "Passage"
                necessary = self.get_num_from_line(line)
            elif "The following is the roll call vote:" in line:
                break  # the next lines contain actual votes

        # process the vote values
        passed = yeas >= necessary
        vote = Vote(chamber, date, motion, passed, yeas, neas, others)

        # Codes and names are separated by two spaces.  Joining and
        # splitting on a double space keeps multi-word names together and
        # leaves "Y Name" entries (single space) for annoying_vote below.
        words = '  '.join(lines[last_index + 1:]).split('  ')

        absent_vote_value = re.compile(r'^(X|A)$')
        yea_vote_value = re.compile(r'^Y$')
        nea_vote_value = re.compile(r'^N$')
        # there aren't two spaces between vote and name so it doesn't get
        # parsed
        annoying_vote = re.compile(r'^(Y|X|A|N) ([\S ]+)$')

        vote_value = None
        for word in words:
            word = word.strip()
            if len(word) > 0 and digits.search(word) is None:
                word = strip_digits(word)
                if vote_value is not None:
                    record(vote, vote_value, word)
                    vote_value = None
                elif absent_vote_value.match(word) is not None:
                    vote_value = 'X'
                elif yea_vote_value.match(word) is not None:
                    vote_value = 'Y'
                elif nea_vote_value.match(word) is not None:
                    vote_value = 'N'
                else:
                    fused = annoying_vote.match(word)
                    if fused is not None:
                        # group(1) is the code, group(2) the name.
                        record(vote, fused.group(1), fused.group(2))

    # Hand the result back to the caller instead of discarding it.
    return vote
def scrape_bills(self, chamber, year):
    """Import every bill of *chamber* for the session starting in *year*.

    The session key is *year* followed by the next year (e.g. "20092010");
    ``NoDataForYear`` is raised when it is not listed in
    ``self.metadata['sessions']``.  Bills are queried by session and
    measure type ('SB' for the upper chamber, 'AB' otherwise).  A bill is
    skipped when it has no version with XML.  Sponsors are limited to
    authors from the same house, actions without text are ignored, and
    every vote is attached with its individual records before the bill is
    passed to ``self.add_bill``.
    """
    session = "%s%d" % (year, int(year) + 1)
    if session not in self.metadata['sessions']:
        raise NoDataForYear(year)

    if chamber == 'upper':
        measure_abbr, chamber_name, house_type = 'SB', 'SENATE', 'S'
    else:
        measure_abbr, chamber_name, house_type = 'AB', 'ASSEMBLY', 'A'

    query = self.session.query(CABill)
    query = query.filter_by(session_year=session)
    bills = query.filter_by(measure_type=measure_abbr)

    for bill in bills:
        bill_session = session
        if bill.session_num != '0':
            bill_session += ' Special Session %s' % bill.session_num

        bill_id = bill.short_bill_id

        # "!= None" is deliberate: it is a column expression handed to
        # filter(), not a Python identity test.
        versions = self.session.query(CABillVersion).filter_by(bill=bill)
        version = versions.filter(CABillVersion.bill_xml != None).first()
        if not version:
            # not enough data to import
            continue

        fsbill = Bill(bill_session, chamber, bill_id,
                      version.title,
                      short_title=version.short_title)

        for author in version.authors:
            if author.house == chamber_name:
                fsbill.add_sponsor(author.contribution, author.name)

        for action in bill.actions:
            if not action.action:
                # NULL action text seems to be an error on CA's part,
                # unless it has some meaning I'm missing
                continue
            actor = action.actor or chamber
            fsbill.add_action(actor, action.action, action.action_date)

        for vote in bill.votes:
            result = vote.vote_result == '(PASS)'

            # The first word of the location names the chamber; the rest
            # is the location within it.
            full_loc = vote.location.description
            loc_words = full_loc.split(' ')
            first_part = loc_words[0].lower()
            if first_part in ['asm', 'assembly']:
                vote_chamber = 'lower'
                vote_location = ' '.join(loc_words[1:])
            elif first_part.startswith('sen'):
                vote_chamber = 'upper'
                vote_location = ' '.join(loc_words[1:])
            else:
                vote_chamber = ''
                vote_location = full_loc

            fsvote = Vote(vote_chamber,
                          vote.vote_date_time,
                          vote.motion.motion_text or '',
                          result,
                          vote.ayes, vote.noes, vote.abstain,
                          threshold=vote.threshold,
                          location=vote_location)

            for record in vote.votes:
                code = record.vote_code
                if code == 'AYE':
                    fsvote.yes(record.legislator_name)
                elif code.startswith('NO'):
                    fsvote.no(record.legislator_name)
                else:
                    fsvote.other(record.legislator_name)

            fsbill.add_vote(fsvote)

        self.add_bill(fsbill)