def scrape_bill_details(self, url, bill):
    """Scrape summary, full-text versions, actions and votes for a VA bill.

    Mutates *bill* in place; missing sections are simply skipped.
    """
    html = self.get(url, retry_on_404=True).text
    doc = lxml.html.fromstring(html)

    # summary sections
    summary = doc.xpath('//h4[starts-with(text(), "SUMMARY")]/following-sibling::p/text()')
    if summary and summary[0].strip():
        bill['summary'] = summary[0].strip()

    # versions
    for va in doc.xpath('//h4[text()="FULL TEXT"]/following-sibling::ul[1]/li/a[1]'):
        # 11/16/09 \xa0House: Prefiled and ordered printed; offered 01/13/10 10100110D
        date, desc = va.text.split(u' \xa0')
        # BUGFIX: the rsplit result was previously discarded (bare
        # expression), leaving the trailing document id in the version name.
        desc = desc.rsplit(' ', 1)[0]  # chop off last part
        link = va.get('href')
        date = datetime.datetime.strptime(date, '%m/%d/%y')

        # budget bills in VA are searchable but no full text available
        if '+men+' in link:
            self.warning('not adding budget version, bill text not available')
        else:
            # VA duplicates reprinted bills, lets keep the original name
            bill.add_version(desc, BASE_URL + link, date=date,
                             mimetype='text/html', on_duplicate='use_old')

    # actions
    for ali in doc.xpath('//h4[text()="HISTORY"]/following-sibling::ul[1]/li'):
        date, action = ali.text_content().split(u' \xa0')
        actor, action = action.split(': ', 1)
        actor = self.actor_map[actor]
        date = datetime.datetime.strptime(date.strip(), '%m/%d/%y')

        # if action ends in (##-Y ##-N) remove that part
        vrematch = self.vote_strip_re.match(action)
        if vrematch:
            action, y, n, o = vrematch.groups()
            vote = Vote(actor, date, action, int(y) > int(n), int(y), int(n), 0)
            vote_url = ali.xpath('a/@href')
            if vote_url:
                self.parse_vote(vote, vote_url[0])
                vote.add_source(BASE_URL + vote_url[0])
            # set other count, it isn't provided
            vote['other_count'] = len(vote['other_votes'])
            #vote.validate()
            bill.add_vote(vote)

        # categorize actions
        for pattern, atype in self._action_classifiers:
            if re.match(pattern, action):
                break
        else:
            atype = 'other'

        # if matched a 'None' atype, don't add the action
        if atype:
            bill.add_action(actor, action, date, type=atype)
def scrape_vote(self, bill, date, motion, url):
    """Scrape a single roll-call page at *url* and attach a Vote to *bill*.

    *date* and *motion* are supplied by the caller; chamber is inferred
    from the URL suffix.
    """
    page = self.urlopen(url)
    if 'not yet official' in page:
        # Sometimes they link to vote pages before they go live
        return

    page = lxml.html.fromstring(page)

    # The chamber is encoded in the URL itself.
    if url.endswith('Senate'):
        actor = 'upper'
    else:
        actor = 'lower'

    # Totals live in centered cells; the count is the last token of the cell.
    count_path = "string(//td[@align = 'center' and contains(., '%s: ')])"
    yes_count = int(page.xpath(count_path % "Yeas").split()[-1])
    no_count = int(page.xpath(count_path % "Nays").split()[-1])
    other_count = int(page.xpath(count_path % "Non Voting").split()[-1])
    other_count += int(page.xpath(count_path % "Present").split()[-1])

    # NOTE(review): "passed" requires yeas to exceed nays *plus* all
    # non-voting members — confirm this matches the chamber's actual rule.
    passed = yes_count > no_count + other_count

    vote = Vote(actor, date, motion, passed, yes_count, no_count, other_count)
    vote.add_source(url)

    # Member names appear in tables following the standard headers, in the
    # order: yeas, nays, then two groups both treated as "other".
    xpath = (
        '//*[contains(@class, "ms-standardheader")]/'
        'following-sibling::table')
    divs = page.xpath(xpath)
    votevals = 'yes no other other'.split()
    for (voteval, div) in zip(votevals, divs):
        for a in div.xpath('.//a'):
            # Dispatch to vote.yes / vote.no / vote.other by name.
            getattr(vote, voteval)(a.text_content())
    bill.add_vote(vote)
def add_vote(self, bill, chamber, date, line, text):
    """Parse an 'Ayes N, Noes M' tally from *text* and attach a Vote to *bill*.

    If *line* links to a roll-call document, individual votes are read
    from it as well.
    """
    tallies = re.findall(r'Ayes (\d+)\, Noes (\d+)', text)
    yes_count = int(tallies[0][0])
    no_count = int(tallies[0][1])

    # Classify the motion; default to 'other' when nothing matches.
    motion_type = 'other'
    for pattern, classified in motion_classifiers.iteritems():
        if re.match(pattern, text):
            motion_type = classified
            break

    vote = Vote(chamber, date, text, yes_count > no_count,
                yes_count, no_count, 0, type=motion_type)

    # fetch the vote itself
    anchors = line.xpath('//a[contains(@href, "/votes/")]')
    if anchors:
        href = anchors[0].get('href')
        vote.add_source(href)
        filename, resp = self.urlretrieve(href)
        # House roll calls contain 'av' in the path, Senate ones 'sv'.
        if 'av' in href:
            self.add_house_votes(vote, filename)
        elif 'sv' in href:
            self.add_senate_votes(vote, filename)

    bill.add_vote(vote)
def scrape_votes(self, link, chamber, bill):
    """Scrape the votes table at *link* and attach each vote to *bill*."""
    with self.urlopen(link) as votes_page_html:
        votes_page = lxml.html.fromstring(votes_page_html)
        page_tables = votes_page.cssselect("table")
        votes_table = page_tables[0]
        votes_elements = votes_table.cssselect("td")
        # Eliminate table headings and unnecessary element
        votes_elements = votes_elements[3:len(votes_elements)]
        # Cells come in groups of five per vote row.
        ve = grouper(5, votes_elements)
        for actor, date, name_and_text, name, text in ve:
            # Skip Committee of the Whole entries.
            if "cow" in text.text_content() or "COW" in text.text_content():
                continue
            vote_date = dt.datetime.strptime(date.text_content(), "%m/%d/%Y")

            motion_and_votes = text.text_content()
            # BUGFIX: str.lstrip() treats its argument as a *character set*,
            # not a prefix, so lstrip("FINAL VOTE - ") could also eat leading
            # letters of the motion itself. Strip the literal prefix instead.
            prefix = "FINAL VOTE - "
            if motion_and_votes.startswith(prefix):
                motion_and_votes = motion_and_votes[len(prefix):]

            motion, sep, votes = motion_and_votes.partition(".")
            passed = "passed" in votes

            votes_match = re.search("([0-9]+)-([0-9]+)-?([0-9]+)?", votes)
            # BUGFIX: counts were previously passed through as strings;
            # convert to ints, and use `is None` for the optional group.
            yes_count = int(votes_match.group(1))
            no_count = int(votes_match.group(2))
            other_count = votes_match.group(3)
            other_count = int(other_count) if other_count is not None else 0

            vote = Vote(chamber, vote_date, motion, passed,
                        yes_count, no_count, other_count)
            vote.add_source(link)
            bill.add_vote(vote)
def parse_vote(self, bill, action, act_chamber, act_date, url,
               re_vote_text=re.compile(r'The question (?:being|to be reconsidered):\s*"(.*?\?)"', re.S),
               re_header=re.compile(r'\d{2}-\d{2}-\d{4}\s{10,}\w{,20} Journal\s{10,}\d{,6}\s{,4}')):
    """Parse a journal page at *url* into one or more Votes on *bill*.

    The compiled regexes are bound as defaults so they are built once;
    they are immutable, so the mutable-default pitfall does not apply.
    """
    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    if len(doc.xpath('//pre')) < 2:
        return

    # Find all chunks of text representing voting reports.
    votes_text_container = doc.xpath('//pre')
    if len(votes_text_container) < 2:
        return
    votes_text = votes_text_container[1].text_content()
    votes_text = re_vote_text.split(votes_text)
    # split() alternates [preamble, motion, body, motion, body, ...];
    # pair each captured motion with the report text that follows it.
    votes_data = zip(votes_text[1::2], votes_text[2::2])

    # Process each.
    for motion, text in votes_data:
        yes = no = other = 0

        # Tally lines look like "YEAS: 18" / "NAYS: 2" etc.; a '-' count
        # means zero. E(xcused) and A(bsent) are folded into "other".
        tally = re.findall(r'\b([YNEA])[A-Z]+:\s{,3}(\d{,3})', text)
        for vtype, vcount in tally:
            vcount = int(vcount) if vcount != '-' else 0
            if vtype == 'Y':
                yes = vcount
            elif vtype == 'N':
                no = vcount
            else:
                other += vcount

        vote = Vote(act_chamber, act_date, motion, yes > no, yes, no, other)

        # In lengthy documents, the "header" can be repeated in the middle
        # of content. This regex gets rid of it.
        vote_lines = re_header.sub('', text)
        vote_lines = vote_lines.split('\r\n')

        # State machine: a "Yeas:"/"Nays:"/... prefix selects which Vote
        # method receives the following comma-separated names; a blank
        # line ends the current list.
        vote_type = None
        for vote_list in vote_lines:
            if vote_list.startswith('Yeas: '):
                vote_list, vote_type = vote_list[6:], vote.yes
            elif vote_list.startswith('Nays: '):
                vote_list, vote_type = vote_list[6:], vote.no
            elif vote_list.startswith('Excused: '):
                vote_list, vote_type = vote_list[9:], vote.other
            elif vote_list.startswith('Absent: '):
                vote_list, vote_type = vote_list[9:], vote.other
            elif vote_list.strip() == '':
                vote_type = None
            if vote_type:
                for name in vote_list.split(','):
                    name = name.strip()
                    if name:
                        vote_type(name)

        vote.add_source(url)
        bill.add_vote(vote)
def add_vote(self, bill, chamber, date, text, url):
    """Build a Vote from an 'Ayes N; Noes M' action string and attach it to *bill*."""
    tallies = re.findall(r'Ayes,? (\d+)[,;]\s+N(?:oes|ays),? (\d+)', text)
    yes_count = int(tallies[0][0])
    no_count = int(tallies[0][1])

    # Classify the motion; default to 'other' when no pattern matches.
    motion_type = 'other'
    for pattern, classified in motion_classifiers.iteritems():
        if re.match(pattern, text):
            motion_type = classified
            break

    vote = Vote(chamber, date, text, yes_count > no_count,
                yes_count, no_count, 0, type=motion_type)

    # fetch the vote itself
    if url:
        vote.add_source(url)
        # House roll calls contain 'av' in the URL, Senate ones 'sv'.
        if 'av' in url:
            self.add_house_votes(vote, url)
        elif 'sv' in url:
            self.add_senate_votes(vote, url)

    # other count is brute forced
    vote['other_count'] = len(vote['other_votes'])
    vote.validate()
    bill.add_vote(vote)
def vote(self):
    """Return a billy vote.

    Builds the Vote from this parser's date/motion/passed accessors and
    the "Yeas"/"Nays" entries of get_counts(); every remaining count is
    lumped into other_count.
    """
    # NOTE(review): actual_vote_dict is created but never populated before
    # being copied into the Vote, so actual_vote is always {} — confirm
    # whether vote_values() was meant to fill it in.
    actual_vote_dict = collections.defaultdict(list)
    date = self.date()
    motion = self.motion()
    passed = self.passed()
    counts = self.get_counts()
    yes_count = int(counts.get("Yeas", 0))
    no_count = int(counts.get("Nays", 0))
    vote = Vote(
        self.chamber, date, motion, passed,
        yes_count, no_count,
        # Everything that isn't a yea or nay counts as "other".
        sum(map(int, counts.values())) - (yes_count + no_count),
        actual_vote=dict(actual_vote_dict),
    )
    for vote_val, voter in self.vote_values():
        # Dispatch to vote.yes / vote.no / vote.other by name.
        getattr(vote, vote_val)(voter)
    vote.add_source(self.url)
    return vote
def parse_senate_vote(self, sv_text, url):
    """Sets any overrides and creates the vote instance.

    Delegates grid parsing to parse_visual_grid(), then cross-checks the
    parsed per-member votes against the page's own TOTAL row and raises
    ValueError on any mismatch.
    """
    overrides = {"ONEILL": "O'NEILL"}
    # Add new columns as they appear to be safe
    # Placeholder values; parse_visual_grid fills in the real data.
    vote = Vote('upper', '?', 'senate passage', False, 0, 0, 0)
    vote.add_source(url)
    vote, rowHeads, saneRow = self.parse_visual_grid(vote, sv_text, overrides, sVoteHeader, rDate, 'TOTAL', 'TOTAL')

    # Sanity checks on vote data, checks that the calculated total and listed totals match
    sane = {'yes': 0, 'no': 0, 'other': 0}
    # Make sure the header row and sanity row are in order
    sorted_rh = sorted(rowHeads.items(), key=operator.itemgetter(0))

    # startCount == -1 means we have not yet reached the TOTAL cell;
    # once found, subsequent cells are matched against the sorted headers.
    startCount = -1
    for cell in saneRow:
        if startCount >= 0:
            saneVote = sorted_rh[startCount][1]
            # Headers starting with Y are yeas, N nays; anything else
            # (excused, absent, ...) accumulates into "other".
            if 'Y' == saneVote[0]:
                sane['yes'] = int(cell[0])
            elif 'N' == saneVote[0]:
                sane['no'] = int(cell[0])
            else:
                sane['other'] += int(cell[0])
            startCount += 1
        elif 'TOTAL' in cell[0]:
            startCount = 0

    # Make sure the parsed vote totals match up with counts in the total field
    if sane['yes'] != vote['yes_count'] or sane['no'] != vote['no_count'] or\
       sane['other'] != vote['other_count']:
        raise ValueError("Votes were not parsed correctly")

    # Make sure the date is a date
    if not isinstance(vote['date'], datetime):
        raise ValueError("Date was not parsed correctly")
    # End Sanity Check

    return vote
def scrape(self, chamber, session):
    """Emit a single synthetic bill (with sponsors, votes and actions) for testing."""
    self.validate_session(session)

    # Bill id and the opposite chamber both follow from `chamber`.
    other_chamber = 'lower' if chamber == 'upper' else 'upper'
    bill_id = 'SB 1' if chamber == 'upper' else 'HB 1'

    bill = Bill(session, chamber, bill_id, 'A super bill')
    bill.add_source('http://example.com/')
    bill.add_version('As Introduced', 'http://example.com/SB1.html')
    bill.add_document('Google', 'http://google.com')
    bill.add_sponsor('primary', 'Bob Smith')
    bill.add_sponsor('secondary', 'Johnson, Sally')

    first_date = datetime.datetime.strptime('1/29/2010', '%m/%d/%Y')
    passage_vote = Vote('upper', first_date, 'Final passage', True, 2, 0, 0)
    passage_vote.yes('Smith')
    passage_vote.yes('Johnson')

    second_date = datetime.datetime.strptime('1/30/2010', '%m/%d/%Y')
    failed_vote = Vote('lower', second_date, 'Final passage', False, 0, 1, 1)
    failed_vote.no('Bob Smith')
    failed_vote.other('S. Johnson')

    bill.add_vote(passage_vote)
    bill.add_vote(failed_vote)

    bill.add_action(chamber, 'introduced', first_date)
    bill.add_action(chamber, 'read first time', second_date)
    bill.add_action(other_chamber, 'introduced', second_date)

    self.save_bill(bill)
def scrape_senate_vote(self, bill, url):
    """Download a Senate roll-call PDF, parse counts/names, attach Vote to *bill*."""
    (path, resp) = self.urlretrieve(url)
    text = convert_pdf(path, "text")
    os.remove(path)

    lines = text.split("\n")

    date_match = re.search(r"Date:\s+(\d+/\d+/\d+)", text)
    if not date_match:
        self.log("Couldn't find date on %s" % url)
        return

    time_match = re.search(r"Time:\s+(\d+:\d+:\d+)\s+(AM|PM)", text)
    date = "%s %s %s" % (date_match.group(1), time_match.group(1),
                         time_match.group(2))
    date = datetime.datetime.strptime(date, "%m/%d/%Y %I:%M:%S %p")
    date = self._tz.localize(date)

    vote_type = None
    yes_count, no_count, other_count = None, None, 0
    votes = []
    # The body of the roll call starts after the fixed 21-line header.
    for line in lines[21:]:
        line = line.strip()
        if not line:
            continue

        if line.startswith("YEAS"):
            yes_count = int(line.split(" - ")[1])
            vote_type = "yes"
        elif line.startswith("NAYS"):
            no_count = int(line.split(" - ")[1])
            vote_type = "no"
        elif line.startswith("EXCUSED") or line.startswith("NOT VOTING"):
            other_count += int(line.split(" - ")[1])
            vote_type = "other"
        else:
            # Name columns are separated by runs of 2+ spaces.
            votes.extend([(n.strip(), vote_type)
                          for n in re.split(r"\s{2,}", line)])

    if yes_count is None or no_count is None:
        # BUGFIX: log message typo ("Couldne't") corrected.
        self.log("Couldn't find vote counts in %s" % url)
        return

    passed = yes_count > no_count + other_count

    clean_bill_id = fix_bill_id(bill["bill_id"])
    motion_line = None
    for i, line in enumerate(lines):
        if line.strip() == clean_bill_id:
            motion_line = i + 2
    # BUGFIX: previously `lines[motion_line]` was evaluated even when the
    # bill id was never found (motion_line is None), raising TypeError
    # instead of logging and skipping.
    if motion_line is None:
        self.log("Couldn't find motion for %s" % url)
        return
    motion = lines[motion_line]
    if not motion:
        self.log("Couldn't find motion for %s" % url)
        return

    vote = Vote("upper", date, motion, passed, yes_count, no_count,
                other_count)
    vote.add_source(url)

    insert_specific_votes(vote, votes)
    check_vote_counts(vote)

    bill.add_vote(vote)
def scrape_committee_vote(self, bill, actor, date, motion, page, url, uniqid):
    """Parse a committee vote table from *page* and attach a Vote to *bill*."""
    votes = page.xpath("//table")[0]
    rows = votes.xpath(".//tr")[0]
    if rows[0].text_content() == 'Votes:':
        # New website layout: the actual vote cells are two rows down.
        rows = votes.xpath(".//tr")[2]
    yno = rows.xpath(".//td")
    if len(yno) < 3:
        # Only a yes column is present.
        yes = yno[0]
        no, other = None, None
    else:
        yes, no, other = rows.xpath(".//td")[:3]

    def proc_block(obj, typ):
        # Collect voter names from the text fragments following each <br>.
        if obj is None:
            return {
                "type": None,
                "count": None,
                "votes": []
            }
        votes = []
        for vote in obj.xpath(".//br"):
            if vote.tail:
                vote = vote.tail.strip()
                if vote:
                    votes.append(vote)
        count = len(votes)
        return {
            "type": typ,
            "count": count,
            "votes": votes
        }

    vote_dict = {
        "yes": proc_block(yes, 'yes'),
        "no": proc_block(no, 'no'),
        "other": proc_block(other, 'other'),
    }

    yes_count = vote_dict['yes']['count']
    # Missing columns yield a count of None; treat them as zero.
    no_count = vote_dict['no']['count'] or 0
    other_count = vote_dict['other']['count'] or 0

    vote = Vote(
        actor,
        date,
        motion,
        (yes_count > no_count),
        yes_count,
        no_count,
        other_count,
        _vote_id=uniqid)
    vote.add_source(url)

    for key in vote_dict:
        for voter in vote_dict[key]['votes']:
            # Dispatch to vote.yes / vote.no / vote.other by key name.
            getattr(vote, key)(voter)

    bill.add_vote(vote)
def scrape_vote(self, bill, action_text, url):
    """Scrape a Maryland floor-vote page and attach a Vote to *bill*.

    *action_text* looks like
    "Vote - Senate Floor - Third Reading Passed (46-0) - 01/16/12".
    """
    doc = lxml.html.fromstring(self.urlopen(url))

    date = None
    yes_count = no_count = other_count = None

    # process action_text - might look like "Vote - Senate Floor - Third Reading Passed (46-0) - 01/16/12"
    if action_text.startswith('Vote - Senate Floor - '):
        action_text = action_text[22:]
        chamber = 'upper'
    elif action_text.startswith('Vote - House Floor - '):
        action_text = action_text[21:]
        chamber = 'lower'

    motion, unused_date = action_text.split(' - ')
    yes_count, no_count = re.findall('\((\d+)-(\d+)\)', motion)[0]
    if 'Passed' in motion:
        motion = motion.split(' Passed')[0]
        passed = True
    elif 'Adopted' in motion:
        motion = motion.split(' Adopted')[0]
        passed = True
    elif 'Rejected' in motion:
        motion = motion.split(' Rejected')[0]
        passed = False
    elif 'Floor Amendment' in motion:
        passed = int(yes_count) > int(no_count)
    else:
        raise Exception('unknown motion: %s' % motion)

    vote = Vote(chamber=chamber, date=None, motion=motion,
                yes_count=int(yes_count), no_count=int(no_count),
                other_count=0, passed=passed)

    # vfunc is set by the section headers and applied to the names that
    # follow, until the next header changes it.
    vfunc = None

    nobrs = doc.xpath('//nobr/text()')
    for text in nobrs:
        text = text.replace(u'\xa0', ' ')
        if text.startswith('Calendar Date: '):
            # BUGFIX: the format used %H (24-hour), which silently ignores
            # the %p marker — PM times were recorded 12 hours early.
            # %I (12-hour) honors AM/PM.
            vote['date'] = datetime.datetime.strptime(
                text.split(': ', 1)[1], '%b %d, %Y %I:%M %p')
        elif 'Yeas' in text and 'Nays' in text and 'Not Voting' in text:
            self.debug(text)
            yeas, nays, nv, exc, absent = re.match(
                '(\d+) Yeas\s+(\d+) Nays\s+(\d+) Not Voting\s+'
                '(\d+) Excused \(Absent\)\s+(\d+) Absent', text).groups()
            vote['yes_count'] = int(yeas)
            vote['no_count'] = int(nays)
            vote['other_count'] = int(nv) + int(exc) + int(absent)
        elif 'Voting Yea' in text:
            vfunc = vote.yes
        elif 'Voting Nay' in text:
            vfunc = vote.no
        elif 'Not Voting' in text or 'Excused' in text:
            vfunc = vote.other
        elif vfunc:
            vfunc(text)

    vote.validate()
    vote.add_source(url)
    bill.add_vote(vote)
def scrape_votes(self, session):
    """Parse NH pipe-delimited roll-call dumps from the loaded zipfile.

    First pass builds Vote objects from the summary file; second pass
    fills in individual legislators' votes from the history file.
    """
    votes = {}
    last_line = []

    for line in self.zf.open('tblrollcallsummary.txt'):
        if line.strip() == "":
            continue
        line = line.split('|')
        if len(line) < 14:
            # Records are sometimes broken across physical lines; glue the
            # continuation onto the previously-saved fragment.
            if len(last_line + line[1:]) == 14:
                # BUGFIX: previously `line = last_line`, which discarded
                # the continuation fields and left a short record.
                line = last_line + line[1:]
                self.warning('used bad vote line')
            else:
                last_line = line
                self.warning('bad vote line %s' % '|'.join(line))
        session_yr = line[0]
        body = line[1]
        vote_num = line[2]
        timestamp = line[3]
        bill_id = line[4].strip()
        yeas = int(line[5])
        nays = int(line[6])
        present = int(line[7])
        absent = int(line[8])
        motion = line[11].strip()

        if session_yr == session and bill_id in self.bills_by_id:
            actor = 'lower' if body == 'H' else 'upper'
            # NOTE(review): '%H' combined with '%p' — if the dump uses
            # 12-hour timestamps, PM times parse 12 hours early; confirm
            # the file's actual time format.
            time = datetime.datetime.strptime(timestamp,
                                              '%m/%d/%Y %H:%M:%S %p')
            # TODO: stop faking passed somehow
            passed = yeas > nays
            vote = Vote(actor, time, motion, passed, yeas, nays,
                        other_count=0)
            votes[body+vote_num] = vote
            self.bills_by_id[bill_id].add_vote(vote)

    for line in self.zf.open('tblrollcallhistory.txt'):
        session_yr, body, v_num, employee, bill_id, vote = line.split('|')

        if session_yr == session and bill_id.strip() in self.bills_by_id:
            leg = self.legislators[employee]['name']
            vote = vote.strip()
            if not body+v_num in votes:
                self.warning("Skipping processing this vote:")
                self.warning("Bad ID: %s" % ( body+v_num ) )
                continue
            #code = self.legislators[employee]['seat']
            if vote == 'Yea':
                votes[body+v_num].yes(leg)
            elif vote == 'Nay':
                votes[body+v_num].no(leg)
            else:
                votes[body+v_num].other(leg)
                # The summary file has no "other" total; tally as we go.
                votes[body+v_num]['other_count'] += 1
def scrape_current(self, chamber, term):
    """Scrape all current-session Kansas bills for *chamber* from the KSLeg API."""
    chamber_name = "Senate" if chamber == "upper" else "House"
    # perhaps we should save this data so we can make one request for both chambers?
    with self.urlopen(ksapi.url + "bill_status/") as bill_request:
        bill_request_json = json.loads(bill_request)
        bills = bill_request_json["content"]
        for bill_data in bills:
            # filtering out other chambers
            # BUGFIX: the flag was initialized as `bill_equal_chamber` but
            # tested as `bill_is_in_chamber`, raising NameError whenever a
            # bill had no history entry for this chamber.
            bill_is_in_chamber = False
            for history in bill_data["HISTORY"]:
                if history["chamber"] == chamber_name:
                    bill_is_in_chamber = True
            if not bill_is_in_chamber:
                continue

            # main
            bill = Bill(term, chamber, bill_data["BILLNO"],
                        bill_data["SHORTTITLE"])
            bill.add_source(ksapi.url + "bill_status/" +
                            bill_data["BILLNO"].lower())
            if bill_data["LONGTITLE"]:
                bill.add_title(bill_data["LONGTITLE"])
            bill.add_document("apn", ksapi.ksleg + bill_data["apn"])
            bill.add_version("Latest", ksapi.ksleg + bill_data["apn"])

            for sponsor in bill_data["SPONSOR_NAMES"]:
                bill.add_sponsor(
                    "primary" if len(bill_data["SPONSOR_NAMES"]) == 1
                    else "cosponsor",
                    sponsor)

            for event in bill_data["HISTORY"]:
                # BUGFIX: the elif branches tested `history` (the stale
                # loop variable from the chamber check above) instead of
                # `event`.
                # NOTE(review): the joined names are read from bill_data;
                # confirm they shouldn't come from the event itself.
                if "committee_names" in event and "conferee_names" in event:
                    actor = " and ".join(bill_data["committee_names"] +
                                         bill_data["conferee_names"])
                elif "committee_names" in event:
                    actor = " and ".join(bill_data["committee_names"])
                elif "conferee_names" in event:
                    actor = " and ".join(bill_data["conferee_names"])
                else:
                    # BUGFIX: previously compared chamber ('upper'/'lower')
                    # against "Senate", which is never true, so the actor
                    # was always 'lower'.
                    actor = chamber

                date = datetime.datetime.strptime(
                    event["occurred_datetime"], "%Y-%m-%dT%H:%M:%S")
                bill.add_action(actor, event["status"], date)

                if event["action_code"] in ksapi.voted:
                    votes = votes_re.match(event["status"])
                    if votes:
                        vote = Vote(chamber, date, votes.group(1),
                                    event["action_code"] in ksapi.passed,
                                    int(votes.group(2)),
                                    int(votes.group(3)), 0)
                        vote.add_source(ksapi.ksleg + "bill_status/" +
                                        bill_data["BILLNO"].lower())
                        bill.add_vote(vote)

            self.save_bill(bill)
def scrape_vote(self, bill, vote_chamber, bill_id, vote_id, vote_date,
                action_text):
    """Scrape an Alabama roll-call results page and attach a Vote to *bill*."""
    url = ('http://alisondb.legislature.state.al.us/Alison/'
           'GetRollCallVoteResults.aspx?'
           'VOTE={0}&BODY={1}&INST={2}&SESS={3}'.
           format(vote_id, vote_chamber, bill_id, self.session_id))
    doc = lxml.html.fromstring(self.get(url=url).text)

    # Buckets keyed by the site's single-letter vote codes:
    # Y(ea), N(ay), P(resent/pass), A(bsent).
    voters = {'Y': [], 'N': [], 'P': [], 'A': []}

    # Cells alternate name, vote-code, name, vote-code, ...
    voters_and_votes = doc.xpath('//table/tr/td/font/text()')
    capture_vote = False
    name = ''
    for item in voters_and_votes:
        if capture_vote:
            capture_vote = False
            if name:
                voters[item].append(name)
        else:
            capture_vote = True
            name = item
            # Discard placeholder rows and the totals footer.
            if (name.endswith(", Vacant") or
                    name.startswith("Total ") or
                    not name.strip()):
                name = ''

    # Check name counts against totals listed on the site
    total_yea = doc.xpath('//*[starts-with(text(), "Total Yea")]/text()')
    if total_yea:
        total_yea = int(total_yea[0].split(":")[-1])
        assert total_yea == len(voters['Y']), "Yea count incorrect"
    else:
        total_yea = len(voters['Y'])

    total_nay = doc.xpath('//*[starts-with(text(), "Total Nay")]/text()')
    if total_nay:
        total_nay = int(total_nay[0].split(":")[-1])
        assert total_nay == len(voters['N']), "Nay count incorrect"
    else:
        total_nay = len(voters['N'])

    total_absent = doc.xpath(
        '//*[starts-with(text(), "Total Absent")]/text()')
    if total_absent:
        total_absent = int(total_absent[0].split(":")[-1])
        assert total_absent == len(voters['A']), "Absent count incorrect"
    # Present and absent members are both folded into the "other" total.
    total_other = len(voters['P']) + len(voters['A'])

    vote = Vote(
        self.CHAMBERS[vote_chamber[0]], vote_date, action_text,
        total_yea > total_nay, total_yea, total_nay, total_other)
    vote.add_source(url)
    for member in voters['Y']:
        vote.yes(member)
    for member in voters['N']:
        vote.no(member)
    for member in (voters['A'] + voters['P']):
        vote.other(member)

    bill.add_vote(vote)
def scrape_vote(self, bill, date, url):
    """Scrape a roll-call page at *url* and attach the resulting Vote to *bill*."""
    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)

        header = page.xpath("string(//h4[contains(@id, 'hdVote')])")
        # Header looks like "<bill>, <location>, <motion...>".
        location = header.split(', ')[1]
        if location.startswith('House'):
            chamber = 'lower'
        elif location.startswith('Senate'):
            chamber = 'upper'
        else:
            # BUGFIX: this error path formatted the message with `chamber`,
            # which is unbound here (NameError); report the unrecognized
            # location instead.
            raise ScrapeError("Bad chamber: %s" % location)

        # Anything after the chamber word is a committee name.
        committee = ' '.join(location.split(' ')[1:]).strip()
        if not committee or committee.startswith('of Representatives'):
            committee = None

        motion = ', '.join(header.split(', ')[2:]).strip()

        yes_count = int(
            page.xpath("string(//td[contains(@id, 'tdAyes')])"))
        no_count = int(
            page.xpath("string(//td[contains(@id, 'tdNays')])"))
        excused_count = int(
            page.xpath("string(//td[contains(@id, 'tdExcused')])"))
        absent_count = int(
            page.xpath("string(//td[contains(@id, 'tdAbsent')])"))
        other_count = excused_count + absent_count

        passed = yes_count > no_count

        # Classify the motion (renamed from `type`, which shadowed the builtin).
        if motion.startswith('Do Pass'):
            vote_type = 'passage'
        elif motion == 'Concurred in amendments':
            vote_type = 'amendment'
        elif motion == 'Veto override':
            vote_type = 'veto_override'
        else:
            vote_type = 'other'

        vote = Vote(chamber, date, motion, passed, yes_count, no_count,
                    other_count)
        vote['type'] = vote_type
        if committee:
            vote['committee'] = committee
        vote.add_source(url)

        # The vote-value cell directly follows each member's name cell.
        for td in page.xpath("//table[contains(@id, 'tblVotes')]/tr/td"):
            if td.text == 'Yea':
                vote.yes(td.getprevious().text.strip())
            elif td.text == 'Nay':
                vote.no(td.getprevious().text.strip())
            elif td.text in ('Excused', 'Absent'):
                vote.other(td.getprevious().text.strip())

        bill.add_vote(vote)
def scrape_vote(self, bill, vote_type_id, vote_type):
    """Scrape a DC LIMS voting page and attach a Vote to *bill*."""
    base_url = "http://dcclims1.dccouncil.us/lims/voting.aspx?VoteTypeID=%s&LegID=%s"
    url = base_url % (vote_type_id, bill["bill_id"])
    with self.urlopen(url) as html:
        doc = lxml.html.fromstring(html)

        vote_date = convert_date(doc.get_element_by_id("VoteDate").text)

        # check if voice vote / approved boxes have an 'x'
        voice = doc.xpath('//span[@id="VoteTypeVoice"]/b/text()')[0] == "x"
        passed = doc.xpath('//span[@id="VoteResultApproved"]/b/text()')[0] == "x"

        yes_count = extract_int(doc.xpath('//span[@id="VoteCount1"]/b/text()')[0])
        no_count = extract_int(doc.xpath('//span[@id="VoteCount2"]/b/text()')[0])
        # every now and then this actually drops below 0 (error in count)
        # NOTE(review): 13 is presumably the size of the DC Council —
        # confirm, and consider naming the constant.
        other_count = max(13 - (yes_count + no_count), 0)

        vote = Vote("upper", vote_date, vote_type, passed, yes_count,
                    no_count, other_count, voice_vote=voice)
        vote.add_source(url)

        # members are only text on page in a <u> tag
        for member_u in doc.xpath("//u"):
            member = member_u.text
            # The member's vote value is rendered in a sibling <i> element.
            vote_text = member_u.xpath("../../i/text()")[0]
            if "Yes" in vote_text:
                vote.yes(member)
            elif "No" in vote_text:
                vote.no(member)
            else:
                vote.other(member)

        bill.add_vote(vote)
def scrape_votes(self, bill, bill_type, number, session):
    """Scrape all journal roll calls for an Ohio bill and attach Votes to *bill*."""
    vote_url = ('http://www.legislature.state.oh.us/votes.cfm?ID=' +
                session + '_' + bill_type + '_' + str(number))
    with self.urlopen(vote_url) as page:
        page = lxml.etree.fromstring(page, lxml.etree.HTMLParser())

        for jlink in page.xpath("//a[contains(@href, 'JournalText')]"):
            date = datetime.datetime.strptime(jlink.text,
                                              "%m/%d/%Y").date()

            # The cell next to the link reads "<chamber> - <motion>...".
            details = jlink.xpath("string(../../../td[2])")

            chamber = details.split(" - ")[0]
            if chamber == 'House':
                chamber = 'lower'
            elif chamber == 'Senate':
                chamber = 'upper'
            else:
                raise ScrapeError("Bad chamber: %s" % chamber)

            motion = details.split(" - ")[1].split("\n")[0].strip()

            vote_row = jlink.xpath("../../..")[0].getnext()

            yea_div = vote_row.xpath(
                "td/font/div[contains(@id, 'Yea')]")[0]
            yeas = []
            for td in yea_div.xpath("table/tr/td"):
                name = td.xpath("string()")
                if name:
                    yeas.append(name)

            no_div = vote_row.xpath(
                "td/font/div[contains(@id, 'Nay')]")[0]
            nays = []
            for td in no_div.xpath("table/tr/td"):
                name = td.xpath("string()")
                if name:
                    nays.append(name)

            yes_count = len(yeas)
            no_count = len(nays)

            # NOTE(review): no other/absent votes are collected, and the
            # vote is added without calling add_source — confirm intended.
            vote = Vote(chamber, date, motion, yes_count > no_count,
                        yes_count, no_count, 0)

            for yes in yeas:
                vote.yes(yes)
            for no in nays:
                vote.no(no)

            bill.add_vote(vote)
def scrape_vote(self, bill, name, url):
    """Scrape a Connecticut roll-call page and attach a Vote to *bill*.

    House and Senate journals lay member names out in different column
    groups, so column indices and offsets are chosen per chamber.
    """
    if "VOTE/H" in url:
        vote_chamber = "lower"
        cols = (1, 5, 9, 13)
        name_offset = 3
        yes_offset = 0
        no_offset = 1
    else:
        vote_chamber = "upper"
        cols = (1, 6)
        name_offset = 4
        yes_offset = 1
        no_offset = 2

    # Connecticut's SSL is causing problems with Scrapelib, so use Requests
    page = requests.get(url, verify=False).text
    if "BUDGET ADDRESS" in page:
        return

    page = lxml.html.fromstring(page)

    yes_count = page.xpath("string(//span[contains(., 'Those voting Yea')])")
    yes_count = int(re.match(r"[^\d]*(\d+)[^\d]*", yes_count).group(1))

    no_count = page.xpath("string(//span[contains(., 'Those voting Nay')])")
    no_count = int(re.match(r"[^\d]*(\d+)[^\d]*", no_count).group(1))

    other_count = page.xpath("string(//span[contains(., 'Those absent')])")
    other_count = int(re.match(r"[^\d]*(\d+)[^\d]*", other_count).group(1))

    need_count = page.xpath("string(//span[contains(., 'Necessary for')])")
    need_count = int(re.match(r"[^\d]*(\d+)[^\d]*", need_count).group(1))

    date = page.xpath("string(//span[contains(., 'Taken on')])")
    date = re.match(r".*Taken\s+on\s+(\d+/\s?\d+)", date).group(1)
    date = date.replace(" ", "")
    # The page omits the year; borrow it from the bill's session.
    date = datetime.datetime.strptime(date + " " + bill["session"],
                                      "%m/%d %Y").date()

    # NOTE(review): passage is computed as yes_count strictly greater than
    # the "Necessary for" threshold — confirm it shouldn't be >=.
    vote = Vote(vote_chamber, date, name, yes_count > need_count,
                yes_count, no_count, other_count)
    vote.add_source(url)

    table = page.xpath("//table")[0]
    for row in table.xpath("tr"):
        for i in cols:
            name = row.xpath("string(td[%d])" % (i + name_offset)).strip()
            if not name or name == "VACANT":
                continue

            if "Y" in row.xpath("string(td[%d])" % (i + yes_offset)):
                vote.yes(name)
            elif "N" in row.xpath("string(td[%d])" % (i + no_offset)):
                vote.no(name)
            else:
                vote.other(name)

    bill.add_vote(vote)
def scrape_vote(self, bill, date, url):
    """Scrape a roll-call page at *url* and attach the resulting Vote to *bill*."""
    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)

        header = page.xpath("string(//h4[contains(@id, 'hdVote')])")
        # Header looks like "<bill>, <location>, <motion...>".
        location = header.split(", ")[1]
        if location.startswith("House"):
            chamber = "lower"
        elif location.startswith("Senate"):
            chamber = "upper"
        else:
            # BUGFIX: this error path formatted the message with `chamber`,
            # which is unbound here (NameError); report the unrecognized
            # location instead.
            raise ScrapeError("Bad chamber: %s" % location)

        # Anything after the chamber word is a committee name.
        committee = " ".join(location.split(" ")[1:]).strip()
        if not committee or committee.startswith("of Representatives"):
            committee = None

        motion = ", ".join(header.split(", ")[2:]).strip()
        if not motion:
            # If we can't detect a motion, skip this vote
            return

        yes_count = int(page.xpath("string(//td[contains(@id, 'tdAyes')])"))
        no_count = int(page.xpath("string(//td[contains(@id, 'tdNays')])"))
        excused_count = int(page.xpath("string(//td[contains(@id, 'tdExcused')])"))
        absent_count = int(page.xpath("string(//td[contains(@id, 'tdAbsent')])"))
        other_count = excused_count + absent_count

        passed = yes_count > no_count

        # Classify the motion (renamed from `type`, which shadowed the builtin).
        if motion.startswith("Do Pass"):
            vote_type = "passage"
        elif motion == "Concurred in amendments":
            vote_type = "amendment"
        elif motion == "Veto override":
            vote_type = "veto_override"
        else:
            vote_type = "other"

        vote = Vote(chamber, date, motion, passed, yes_count, no_count,
                    other_count)
        vote["type"] = vote_type
        if committee:
            vote["committee"] = committee
        vote.add_source(url)

        # The vote-value cell directly follows each member's name cell.
        for td in page.xpath("//table[contains(@id, 'tblVotes')]/tr/td"):
            if td.text == "Yea":
                vote.yes(td.getprevious().text.strip())
            elif td.text == "Nay":
                vote.no(td.getprevious().text.strip())
            elif td.text in ("Excused", "Absent"):
                vote.other(td.getprevious().text.strip())

        bill.add_vote(vote)
def scrape_vote(self, bill, vote_url, chamber, date):
    """Scrape a Colorado vote-summary page and attach a Vote to *bill*."""
    page = self.lxmlize(vote_url)

    try:
        motion = page.xpath('//td/b/font[text()="MOTION:"]/../../following-sibling::td/font/text()')[0]
    except:
        # NOTE(review): bare except — consider narrowing to IndexError.
        self.warning("Vote Summary Page Broken ")
        return

    # Withdrawn motions carry no roll call worth recording.
    if 'withdrawn' not in motion:
        # Every table row after the one with VOTE in a td/div/b/font
        rolls = page.xpath('//tr[preceding-sibling::tr/td/div/b/font/text()="VOTE"]')
        # The final roll row carries the tallies.
        count_row = rolls[-1]
        yes_count = count_row.xpath('.//b/font[normalize-space(text())="YES:"]'
                                    '/../following-sibling::font[1]/text()')[0]
        no_count = count_row.xpath('.//b/font[normalize-space(text())="NO:"]'
                                   '/../following-sibling::font[1]/text()')[0]
        exc_count = count_row.xpath('.//b/font[normalize-space(text())="EXC:"]'
                                    '/../following-sibling::font[1]/text()')[0]
        nv_count = count_row.xpath('.//b/font[normalize-space(text())="ABS:"]'
                                   '/../following-sibling::font[1]/text()')[0]

        if count_row.xpath('.//b/font[normalize-space(text())="FINAL ACTION:"]'
                           '/../following-sibling::b[1]/font/text()'):
            final = count_row.xpath('.//b/font[normalize-space(text())="FINAL ACTION:"]'
                                    '/../following-sibling::b[1]/font/text()')[0]
            passed = True if 'pass' in final.lower() or int(yes_count) > int(no_count) else False
        elif 'passed without objection' in motion.lower():
            passed = True
            # No tally is listed: every member on the roll counts as a yes.
            yes_count = int(len(rolls[:-2]))
        else:
            self.warning("No vote breakdown found for %s" % vote_url)
            return

        # Excused plus not-voting make up the "other" bucket.
        other_count = int(exc_count) + int(nv_count)

        vote = Vote(chamber, date, motion, passed, int(yes_count),
                    int(no_count), int(other_count))

        # The last two rows are the count row and the final-action row.
        for roll in rolls[:-2]:
            voter = roll.xpath('td[2]/div/font')[0].text_content()
            voted = roll.xpath('td[3]/div/font')[0].text_content().strip()
            if voted:
                if 'Yes' in voted:
                    vote.yes(voter)
                elif 'No' in voted:
                    vote.no(voter)
                else:
                    vote.other(voter)
            elif 'passed without objection' in motion.lower() and voter:
                vote.yes(voter)

        bill.add_vote(vote)
def scrape_votes(self, bill, votes_url):
    """Scrape every vote-history PDF linked from *votes_url* onto *bill*."""
    html = self.urlopen(votes_url)
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(votes_url)

    EXPECTED_VOTE_CODES = ['Y','N','E','NV','A','P','-']

    # vote indicator, a few spaces, a name, newline or multiple spaces
    VOTE_RE = re.compile('(Y|N|E|NV|A|P|-)\s{2,5}(\w.+?)(?:\n|\s{2})')

    for link in doc.xpath('//a[contains(@href, "votehistory")]'):
        # Link text is "<bill> - <motion> - <date>" or "<bill> - <date>";
        # when no motion is given, it's a third reading.
        pieces = link.text.split(' - ')
        date = pieces[-1]
        if len(pieces) == 3:
            motion = pieces[1]
        else:
            motion = 'Third Reading'

        chamber = link.xpath('../following-sibling::td/text()')[0]
        if chamber == 'HOUSE':
            chamber = 'lower'
        elif chamber == 'SENATE':
            chamber = 'upper'
        else:
            self.warning('unknown chamber %s' % chamber)

        date = datetime.datetime.strptime(date, "%A, %B %d, %Y")

        # download the file
        fname, resp = self.urlretrieve(link.get('href'))
        pdflines = convert_pdf(fname, 'text').splitlines()
        os.remove(fname)

        # Counts start at zero; they are derived from the roll call below.
        vote = Vote(chamber, date, motion.strip(), False, 0, 0, 0)
        for line in pdflines:
            for match in VOTE_RE.findall(line):
                vcode, name = match
                if vcode == 'Y':
                    vote.yes(name)
                elif vcode == 'N':
                    vote.no(name)
                else:
                    # E / NV / A / P / '-' all land in "other".
                    vote.other(name)

        # fake the counts
        vote['yes_count'] = len(vote['yes_votes'])
        vote['no_count'] = len(vote['no_votes'])
        vote['other_count'] = len(vote['other_votes'])
        vote['passed'] = vote['yes_count'] > vote['no_count']

        vote.add_source(link.get('href'))
        bill.add_vote(vote)
def parse_vote(self, bill, action, act_chamber, act_date, url):
    """Parse a journal vote page at *url*; tallies come from the *action* text."""
    html = self.urlopen(url)
    doc = lxml.html.fromstring(html)

    yes = no = other = 0
    # Counts are embedded in the action string as letter+number pairs
    # (the regex accepts '-' as a zero count); E(xcused) and A(bsent)
    # are folded into "other".
    tally = re.findall('(?:(Y|N|E|A)(-|\d+)\s*)', action)
    for vtype, vcount in tally:
        vcount = int(vcount) if vcount != '-' else 0
        if vtype == 'Y':
            yes = vcount
        elif vtype == 'N':
            no = vcount
        else:
            other += vcount

    # regex against plain html for motion
    try:
        motion = re.findall('The question being:\s*"(.*)\?"', html,
                            re.DOTALL)[0].replace('\r\n', ' ')
    except IndexError:
        # No recognizable motion on the page — skip this vote entirely.
        return

    vote = Vote(act_chamber, act_date, motion, yes > no, yes, no, other)

    #vote_lines = doc.xpath('//b[contains(text(), "YEAS:")]')[0].tail.split('\r\n')
    vote_lines = doc.xpath('//pre')[1].text_content().split('\r\n')

    # State machine: a "Yeas:"/"Nays:"/... prefix selects which Vote
    # method receives the following comma-separated names; a blank line
    # ends the current list.
    vote_type = None
    for vote_list in vote_lines:
        if vote_list.startswith('Yeas: '):
            vote_list, vote_type = vote_list[6:], vote.yes
        elif vote_list.startswith('Nays: '):
            vote_list, vote_type = vote_list[6:], vote.no
        elif vote_list.startswith('Excused: '):
            vote_list, vote_type = vote_list[9:], vote.other
        elif vote_list.startswith('Absent: '):
            vote_list, vote_type = vote_list[9:], vote.other
        elif vote_list.strip() == '':
            vote_type = None
        if vote_type:
            for name in vote_list.split(','):
                name = name.strip()
                if name:
                    vote_type(name)

    vote.add_source(url)
    bill.add_vote(vote)
def vote(self):
    '''Return a billy vote.

    Counts come from the parsed headers; individual and "actual" votes
    come from the name columns of each parsed section.
    '''
    actual_vote_dict = collections.defaultdict(list)
    # Counts start at 0 and are overwritten per section below.
    vote = Vote('lower', self.date(), self.motion(), self.passed(),
                0, 0, 0, actual_vote={})
    for (vote_val, count), (actual_vote, _), text in self._parse():
        vote[vote_val + '_count'] = count
        for name in filter(None, PlaintextColumns(text)):
            actual_vote_dict[actual_vote].append(name)
            getattr(vote, vote_val)(name)
    # BUGFIX: the actual_vote mapping was previously snapshotted with
    # dict() *before* the parse loop populated it, so it was always empty.
    # Store the populated copy after the loop instead.
    vote['actual_vote'] = dict(actual_vote_dict)
    vote.add_source(self.url)
    return vote
def _parse_senate_votes(self, vote_data):
    """Build a billy Vote from a NY Senate API vote record (a parsed JSON dict)."""
    vote_datetime = datetime.datetime.strptime(vote_data['voteDate'],
                                               '%Y-%m-%d')
    vote = Vote(
        chamber='upper',
        date=vote_datetime.date(),
        motion='[No motion available.]',
        passed=False,
        yes_votes=[],
        no_votes=[],
        other_votes=[],
        yes_count=0,
        no_count=0,
        other_count=0)

    if vote_data['voteType'] == 'FLOOR':
        vote['motion'] = 'Floor Vote'
    elif vote_data['voteType'] == 'COMMITTEE':
        vote['motion'] = '{} Vote'.format(vote_data['committee']['name'])
    else:
        raise ValueError('Unknown vote type encountered.')

    vote_rolls = vote_data['memberVotes']['items']

    # Count all yea votes.
    if 'items' in vote_rolls.get('AYE', {}):
        for legislator in vote_rolls['AYE']['items']:
            vote.yes(legislator['fullName'])
            vote['yes_count'] += 1
    if 'items' in vote_rolls.get('AYEWR', {}):
        for legislator in vote_rolls['AYEWR']['items']:
            vote.yes(legislator['fullName'])
            vote['yes_count'] += 1

    # Count all nay votes.
    if 'items' in vote_rolls.get('NAY', {}):
        for legislator in vote_rolls['NAY']['items']:
            vote.no(legislator['fullName'])
            vote['no_count'] += 1

    # Count all other types of votes.
    other_vote_types = ('EXC', 'ABS', 'ABD')
    for vote_type in other_vote_types:
        # BUGFIX: use the same "'items' in ..." guard as the branches
        # above; the old truthiness test indexed ['items'] unconditionally
        # and raised KeyError when a roll existed without an 'items' key.
        if 'items' in vote_rolls.get(vote_type, {}):
            for legislator in vote_rolls[vote_type]['items']:
                vote.other(legislator['fullName'])
                vote['other_count'] += 1

    vote['passed'] = vote['yes_count'] > vote['no_count']

    return vote
def scrape_vote(self, bill, chamber, url):
    """Scrape a roll-call detail page and attach a Vote to *bill*.

    The second table row holds date, motion, counts and outcome; the
    remaining two-cell rows hold one member vote each.
    """
    page = self.urlopen(url)
    if 'There are no details available for this roll call' in page:
        # Sometimes they link to vote pages before they go live
        return
    # normalize non-breaking spaces before handing the markup to lxml
    page = page.replace(' ', ' ')
    page = lxml.html.fromstring(page)

    info_row = page.xpath("//table[1]/tr[2]")[0]

    date = info_row.xpath("string(td[1])")
    date = datetime.datetime.strptime(date, "%m/%d/%Y")

    motion = info_row.xpath("string(td[2])")
    yes_count = int(info_row.xpath("string(td[3])"))
    no_count = int(info_row.xpath("string(td[4])"))
    other_count = int(info_row.xpath("string(td[5])"))
    passed = info_row.xpath("string(td[6])") == 'Pass'

    # NOTE(review): `type` shadows the builtin, and the motion type
    # computed here is never passed to Vote() below -- confirm whether it
    # was meant to be supplied as the vote's type
    if motion == 'Shall the bill pass?':
        type = 'passage'
    elif motion == 'Shall the bill be read the third time?':
        type = 'reading:3'
    elif 'be amended as' in motion:
        type = 'amendment'
    else:
        type = 'other'

    vote = Vote(chamber, date, motion, passed,
                yes_count, no_count, other_count)
    vote.add_source(url)

    for tr in page.xpath("//table[1]/tr")[3:]:
        if len(tr.xpath("td")) != 2:
            continue

        # avoid splitting duplicate names
        name = tr.xpath("string(td[1])").strip()
        if not name.startswith(DOUBLED_NAMES):
            # strip the trailing " of <town>" disambiguation
            name = name.split(' of')[0]

        type = tr.xpath("string(td[2])").strip()
        if type.startswith('Yea'):
            vote.yes(name)
        elif type.startswith('Nay'):
            vote.no(name)
        elif type.startswith('Not Voting'):
            # non-voting members are deliberately not recorded
            pass
        else:
            vote.other(name)

    bill.add_vote(vote)
def scrape_vote(self, bill, motion, url):
    """Scrape a roll-call page and attach a Vote to *bill*.

    Yeas/Nays become yes/no counts; Absent (X) and Excused (E) are
    folded into the "other" count.  Raises ValueError if the chamber
    cannot be determined from *url*.
    """
    page = self.urlopen(url, retry_on_404=True)
    page = lxml.html.fromstring(page)

    yeas_cell = page.xpath("//td[text() = 'Yeas (Y):']")[0]
    yes_count = int(yeas_cell.xpath("string(following-sibling::td)"))

    nays_cell = page.xpath("//td[text() = 'Nays (N):']")[0]
    no_count = int(nays_cell.xpath("string(following-sibling::td)"))

    abs_cell = page.xpath("//td[text() = 'Absent (X):']")[0]
    abs_count = int(abs_cell.xpath("string(following-sibling::td)"))

    ex_cell = page.xpath("//td[text() = 'Excused (E):']")[0]
    ex_count = int(ex_cell.xpath("string(following-sibling::td)"))

    other_count = abs_count + ex_count

    if 'chamber=House' in url:
        chamber = 'lower'
    elif 'chamber=Senate' in url:
        chamber = 'upper'
    else:
        # ROBUSTNESS FIX: previously an unrecognized URL left `chamber`
        # unbound and the method crashed later with a confusing NameError.
        raise ValueError('cannot determine chamber from vote URL: %s' % url)

    date_cell = page.xpath("//td[text() = 'Date:']")[0]
    date = date_cell.xpath("string(following-sibling::td)")
    try:
        date = datetime.datetime.strptime(date, "%B %d, %Y")
    except ValueError:
        # some pages abbreviate the month, e.g. "Jan. 5, 2011"
        date = datetime.datetime.strptime(date, "%b. %d, %Y")

    outcome_cell = page.xpath("//td[text()='Outcome:']")[0]
    outcome = outcome_cell.xpath("string(following-sibling::td)")

    vote = Vote(chamber, date, motion, outcome == 'PREVAILS',
                yes_count, no_count, other_count)
    vote.add_source(url)

    member_cell = page.xpath("//td[text() = 'Member']")[0]
    for row in member_cell.xpath("../../tr")[1:]:
        name = row.xpath("string(td[2])")
        # name = name.split(" of ")[0]

        vtype = row.xpath("string(td[4])")
        if vtype == 'Y':
            vote.yes(name)
        elif vtype == 'N':
            vote.no(name)
        elif vtype == 'X' or vtype == 'E':
            vote.other(name)

    bill.add_vote(vote)
def parse_vote(self, bill, actor, date, motion, url, uniqid):
    """Parse a plain-text roll call at *url* and record it on *bill*.

    The page body contains "YEAS - n ... NAYS - n ... ABSENT ... - n ..."
    sections; names inside each section are separated by runs of
    whitespace.
    """
    text = self.get(url).text
    bill.add_source(url)

    tally = re.compile(
        "YEAS -?\s?(\d+)(.*)NAYS -?\s?(\d+)"
        "(.*)ABSENT( OR NOT VOTING)? -?\s?"
        "(\d+)(.*)",
        re.MULTILINE | re.DOTALL,
    ).search(text)

    yes_count = int(tally.group(1))
    no_count = int(tally.group(3))
    other_count = int(tally.group(6))
    passed = yes_count > no_count

    # a floor vote is identified by chamber; anything else is a location
    if actor in ("upper", "lower"):
        vote_chamber, vote_location = actor, ""
    else:
        vote_chamber, vote_location = "", actor

    vote = Vote(
        vote_chamber,
        date,
        motion,
        passed,
        yes_count,
        no_count,
        other_count,
        location=vote_location,
        _vote_id=uniqid,
    )
    vote.add_source(url)

    # record each section's names with the matching Vote method
    sections = (
        (tally.group(2), vote.yes),
        (tally.group(4), vote.no),
        (tally.group(7), vote.other),
    )
    for blob, record in sections:
        for name in re.split("\s{2,}", blob.strip()):
            if name:
                record(name)

    bill.add_vote(vote)
def scrape_votes(self, session):
    """Parse the roll-call dump files in the session zip and attach votes.

    tblrollcallsummary.txt holds one pipe-delimited summary row per roll
    call (14 fields); tblrollcallhistory.txt holds one row per legislator
    per roll call.  Votes are keyed by body letter + vote number so the
    second pass can attach individual legislators.
    """
    votes = {}
    last_line = []

    for line in self.zf.open("tblrollcallsummary.txt"):
        line = line.split("|")
        if len(line) < 14:
            # a record was split across two physical lines; try stitching
            # it back together with the stashed first half
            if len(last_line + line[1:]) == 14:
                # BUG FIX: previously this assigned `line = last_line`,
                # which silently dropped the continuation fields
                line = last_line + line[1:]
                self.warning("used bad vote line")
            else:
                last_line = line
                self.warning("bad vote line %s" % "|".join(line))
        session_yr = line[0]
        body = line[1]
        vote_num = line[2]
        timestamp = line[3]
        bill_id = line[4].strip()
        yeas = int(line[5])
        nays = int(line[6])
        present = int(line[7])
        absent = int(line[8])
        motion = line[11].strip()

        if session_yr == session and bill_id in self.bills_by_id:
            actor = "lower" if body == "H" else "upper"
            time = datetime.datetime.strptime(timestamp,
                                              "%m/%d/%Y %H:%M:%S %p")
            # TODO: stop faking passed somehow
            passed = yeas > nays
            vote = Vote(actor, time, motion, passed, yeas, nays, absent)
            votes[body + vote_num] = vote
            self.bills_by_id[bill_id].add_vote(vote)

    for line in self.zf.open("tblrollcallhistory.txt"):
        session_yr, body, v_num, employee, bill_id, vote = line.split("|")

        if session_yr == session and bill_id.strip() in self.bills_by_id:
            leg = self.legislators[employee]["name"]
            vote = vote.strip()
            # code = self.legislators[employee]['seat']
            if vote == "Yea":
                votes[body + v_num].yes(leg)
            elif vote == "Nay":
                votes[body + v_num].no(leg)
            else:
                votes[body + v_num].other(leg)
def parse_vote(self, actor, date, row):
    """
    takes the actor, date and row element and returns a Vote object
    """
    spans = row.xpath('.//span')

    # motion text lives in the row's leading text node
    motion = row.text.replace(u'\u00a0', " ").replace("-", "").strip()
    if not motion:
        motion = "passage"

    # first span holds "<outcome>-<yes>-<no>-<other>"
    passed, yes_count, no_count, other_count = \
        spans[0].text_content().rsplit('-', 3)

    yea_names = self.get_names(spans[1].tail)
    nay_names = self.get_names(spans[2].tail)
    absent_or_excused = []
    for span in spans[3:]:
        if span.text.startswith(('Absent', 'Excused')):
            absent_or_excused += self.get_names(span.tail)

    # map the outcome word onto a boolean
    outcome_words = {'adopted': True, 'passed': True, 'failed': False}
    for word, result in outcome_words.items():
        if word in passed.lower():
            passed = result
            break

    vote = Vote(actor, date, motion, passed,
                int(yes_count), int(no_count), int(other_count))

    rolls = (
        (yea_names, vote.yes),
        (nay_names, vote.no),
        (absent_or_excused, vote.other),
    )
    for names, record in rolls:
        for name in names:
            if name and name != 'None':
                record(name)

    return vote
def parse_senate_vote(self, url):
    """ senate PDFs -> garbled text -> good text -> Vote

    Scans the converted PDF text line by line: before the vote table the
    date and passage flag are picked up; inside it, each member's vote is
    inferred from how many spaces separate the name from its 'X' mark.
    """
    # date/counts/passed get filled in while scanning; '?' is a placeholder
    vote = Vote('upper', '?', 'senate passage', False, 0, 0, 0)
    vote.add_source(url)
    fname, resp = self.urlretrieve(url)
    # this gives us the cleaned up text
    sv_text = convert_sv_text(convert_pdf(fname, 'text'))
    os.remove(fname)
    in_votes = False

    # use in_votes as a sort of state machine
    for line in sv_text:
        # not 'in_votes', get date or passage
        if not in_votes:
            dmatch = re.search('DATE:(\d{2}-\d{2}-\d{2})', line)
            if dmatch:
                date = dmatch.groups()[0]
                vote['date'] = datetime.strptime(date, '%m-%d-%y')

            if 'YES NO ABS EXC' in line:
                in_votes = True
            elif 'PASSED' in line:
                vote['passed'] = True

        # in_votes: totals & votes
        else:
            # totals
            if 'TOTALS' in line:
                # Lt. Governor voted
                if 'GOVERNOR' in line:
                    name, spaces, line = re.match(
                        ' ([A-Z,.]+)(\s+)X(.*)', line).groups()
                    # one space before the X means a yes vote
                    if len(spaces) == 1:
                        vote.yes(name)
                    else:
                        vote.no(name)

                _, yes, no, abs, exc = line.split()
                vote['yes_count'] = int(yes)
                vote['no_count'] = int(no)
                vote['other_count'] = int(abs) + int(exc)
                # no longer in votes
                in_votes = False
                continue

            # pull votes out
            # NOTE(review): re.match returns None for lines that don't hold
            # exactly two name/X pairs, which would raise AttributeError --
            # assumes the cleaned text always has two columns; verify
            matches = re.match(
                ' ([A-Z,.]+)(\s+)X\s+([A-Z,.]+)(\s+)X', line).groups()
            name1, spaces1, name2, spaces2 = matches

            # vote can be determined by # of spaces
            if len(spaces1) == 1:
                vote.yes(name1)
            elif len(spaces1) == 2:
                vote.no(name1)
            else:
                vote.other(name1)

            if len(spaces2) == 1:
                vote.yes(name2)
            elif len(spaces2) == 2:
                vote.no(name2)
            else:
                vote.other(name2)

    return vote
def scrape_votes(self, bill, link):
    """Scrape all roll calls from a bill page's vote-data span.

    The span's text is split into one blob per vote; each blob is further
    split on a run of ten non-breaking spaces into the motion followed by
    count/name segments.  Returns *bill* with the votes attached.
    """
    page = self.get(link).text
    page = lxml.html.fromstring(page)
    raw_vote_data = page.xpath(
        "//span[@id='lblVoteData']")[0].text_content()
    raw_vote_data = re.split('\w+? by [\w ]+?\s+-',
                             raw_vote_data.strip())[1:]
    for raw_vote in raw_vote_data:
        raw_vote = raw_vote.split(
            u'\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0')
        motion = raw_vote[0]

        # NOTE(review): if no date is found, vote_date stays None (the
        # falsy match object) and is passed to Vote() below -- confirm
        # downstream tolerates a missing date
        vote_date = re.search('(\d+/\d+/\d+)', motion)
        if vote_date:
            vote_date = datetime.datetime.strptime(vote_date.group(),
                                                   '%m/%d/%Y')

        passed = ('Passed' in motion or
                  'Recommended for passage' in motion or
                  'Adopted' in raw_vote[1])
        vote_regex = re.compile('\d+$')
        aye_regex = re.compile('^.+voting aye were: (.+) -')
        no_regex = re.compile('^.+voting no were: (.+) -')
        other_regex = re.compile('^.+present and not voting were: (.+) -')
        yes_count = 0
        no_count = 0
        other_count = 0
        ayes = []
        nos = []
        others = []

        # each remaining segment is either a count line or a name list
        for v in raw_vote[1:]:
            v = v.strip()
            if v.startswith('Ayes...') and vote_regex.search(v):
                yes_count = int(vote_regex.search(v).group())
            elif v.startswith('Noes...') and vote_regex.search(v):
                no_count = int(vote_regex.search(v).group())
            elif v.startswith(
                    'Present and not voting...') and vote_regex.search(v):
                other_count += int(vote_regex.search(v).group())
            elif aye_regex.search(v):
                ayes = aye_regex.search(v).groups()[0].split(', ')
            elif no_regex.search(v):
                nos = no_regex.search(v).groups()[0].split(', ')
            elif other_regex.search(v):
                others += other_regex.search(v).groups()[0].split(', ')

        if 'ChamberVoting=H' in link:
            chamber = 'lower'
        else:
            chamber = 'upper'

        vote = Vote(chamber, vote_date, motion, passed, yes_count,
                    no_count, other_count)
        vote.add_source(link)

        # de-duplicate names across all three rolls
        seen = set()
        for a in ayes:
            if a in seen:
                continue
            vote.yes(a)
            seen.add(a)
        for n in nos:
            if n in seen:
                continue
            vote.no(n)
            seen.add(n)
        for o in others:
            if o in seen:
                continue
            vote.other(o)
            seen.add(o)

        # vote.validate()
        bill.add_vote(vote)
    return bill
def _process_votes(self, rollcalls, bill, proxy):
    """Download each roll-call PDF, parse it, and attach Votes to *bill*.

    The PDF text layout is positional: line 0 names the chamber, lines
    1-2 hold the date/time, lines 3-7 hold result/motion/counts, and the
    name lists start at line 8 under "YEA-"/"NAY-"/etc. headers.
    """
    result_types = {
        'FAILED': False,
        'DEFEATED': False,
        'PREVAILED': True,
        'PASSED': True,
        'SUSTAINED': True,
        'NOT SECONDED': False,
        'OVERRIDDEN': True,
        'ADOPTED': True,
    }

    for r in rollcalls:
        proxy_link = proxy["url"] + r["link"]
        (path, resp) = self.urlretrieve(proxy_link)
        text = convert_pdf(path, 'text')
        lines = text.split("\n")
        os.remove(path)

        chamber = "lower" if "house of representatives" in lines[0].lower() else "upper"
        date_parts = lines[1].strip().split()[-3:]
        date_str = " ".join(date_parts).title() + " " + lines[2].strip()
        vote_date = datetime.datetime.strptime(date_str,
                                               "%b %d, %Y %I:%M:%S %p")

        passed = None
        for res, val in result_types.items():
            # We check multiple lines now because the result of the
            # roll call vote as parsed can potentially be split.
            # PDF documents suck.
            for line in lines[3:5]:
                if res in line.upper():
                    passed = val
                    break
        if passed is None:
            raise AssertionError("Missing bill passage type")

        motion = " ".join(lines[4].split()[:-2])
        try:
            yeas = int(lines[4].split()[-1])
            nays = int(lines[5].split()[-1])
            excused = int(lines[6].split()[-1])
            not_voting = int(lines[7].split()[-1])
        except ValueError:
            self.logger.warning("Vote format is weird, skipping")
            continue

        other_count = excused + not_voting
        vote = Vote(chamber, vote_date, motion, passed, yeas, nays,
                    other_count, yes_votes=[], no_votes=[], other_votes=[])
        vote.add_source(proxy_link)

        currently_counting = ""
        possible_vote_lines = lines[8:]
        for l in possible_vote_lines:
            # undo UTF-8 non-breaking-space artifacts from the PDF text
            l = l.replace("NOT\xc2\xa0VOTING", "NOT VOTING")
            l = l.replace("\xc2\xa0", " -")
            if "yea-" in l.lower().replace(" ", ""):
                currently_counting = "yes_votes"
            elif "nay-" in l.lower().replace(" ", ""):
                currently_counting = "no_votes"
            elif "excused-" in l.lower().replace(" ", ""):
                currently_counting = "other_votes"
            elif "notvoting-" in l.lower().replace(" ", ""):
                currently_counting = "other_votes"
            elif currently_counting == "":
                pass
            elif re.search(r'v\. \d\.\d', l):
                # this gets rid of the version number
                # which is often found at the bottom of the doc
                pass
            else:
                voters = l.split(" ")
                for v in voters:
                    if v.strip():
                        vote[currently_counting].append(v.strip())

        # BUG FIX: these three checks previously used `==`, which warned
        # when the counts *did* match and stayed silent on mismatches.
        if len(vote["yes_votes"]) != vote["yes_count"]:
            self.logger.warning(
                "Yes vote counts ({count}) don't match count of actual votes ({actual}): {url}".format(
                    count=vote["yes_count"], actual=len(vote["yes_votes"]), url=proxy_link))
        if len(vote["no_votes"]) != vote["no_count"]:
            self.logger.warning(
                "No vote counts ({count}) don't match count of actual votes ({actual}): {url}".format(
                    count=vote["no_count"], actual=len(vote["no_votes"]), url=proxy_link))
        if len(vote["other_votes"]) != vote["other_count"]:
            self.logger.warning(
                "Other vote counts ({count}) don't match count of actual votes ({actual}): {url}".format(
                    count=vote["other_count"], actual=len(vote["other_votes"]), url=proxy_link))

        #indiana only has simple majorities even for veto overrides
        #if passage status isn't the same as yes>no, then we should look!
        bill_type = bill['type'][0]
        vote_invalid = False
        # It seems resolutions may be passed without a recorded vote.
        # Don't understand why there's a roll call then, but hey.
        if 'resolution' in bill_type:
            if vote['passed'] != (vote['yes_count'] >= vote['no_count']):
                vote_invalid = True
        else:
            if vote['passed'] != (vote['yes_count'] > vote['no_count']):
                vote_invalid = True
        if vote_invalid:
            raise AssertionError('Vote count doesn\'t agree with vote '
                                 'passage status.')

        bill.add_vote(vote)
def scrape(self, chamber, session):
    """Scrape floor votes for *session* from the chamber's journal PDFs.

    Walks the session's journal index page, converts each PDF to text,
    and runs a state machine over the lines to capture ROLL CALL motion
    names and their YEAS/NAYS/ABSENT AND NOT VOTING member lists.
    """
    chamber_name = 'house' if chamber == 'lower' else 'senate'
    session_slug = {
        '62': '62-2011',
        '63': '63-2013',
        '64': '64-2015',
        '65': '65-2017',
    }[session]

    # Open the index page of the session's Registers, and open each
    url = "http://www.legis.nd.gov/assembly/%s/journals/%s-journal.html" % (
        session_slug, chamber_name)
    page = self.lxmlize(url)

    pdfs = page.xpath("//a[contains(@href, '.pdf')]")
    for pdf in pdfs:

        # Initialize information about the vote parsing
        results = {}
        in_motion = False
        cur_vote = None
        in_vote = False
        cur_motion = ""

        # Determine which URLs the information was pulled from
        pdf_url = pdf.attrib['href']

        try:
            (path, response) = self.urlretrieve(pdf_url)
        except requests.exceptions.ConnectionError:
            continue

        # Convert the PDF to text
        data = convert_pdf(path, type='text')
        os.unlink(path)

        # Determine the date of the document
        date = re.findall(date_re, data)
        if date:
            date = date[0][0]
            cur_date = datetime.datetime.strptime(date, "%A, %B %d, %Y")
        else:
            # If no date is found anywhere, do not process the document
            self.warning("No date was found for the document; skipping.")
            continue

        # Check each line of the text for motion and vote information
        lines = data.splitlines()
        for line in lines:

            # Ignore lines with no information
            if re.search(chamber_re, line) or \
                    re.search(date_re, line) or \
                    re.search(page_re, line) or \
                    line.strip() == "":
                pass

            # Ensure that motion and vote capturing are not _both_ active
            elif in_motion and in_vote:
                raise AssertionError(
                    "Scraper should not be simultaneously processing " +
                    "motion name and votes, as it is for this motion: " +
                    cur_motion
                )

            # Start capturing motion text after a ROLL CALL header
            elif not in_motion and not in_vote:
                if line.strip() == "ROLL CALL":
                    in_motion = True

            elif in_motion and not in_vote:
                if cur_motion == "":
                    cur_motion = line.strip()
                else:
                    cur_motion = cur_motion + " " + line.strip()

                # ABSENT AND NOT VOTING marks the end of each motion name
                # In this case, prepare to capture votes
                if line.strip().endswith("VOTING") or \
                        line.strip().endswith("VOTING."):
                    in_motion = False
                    in_vote = True

            elif not in_motion and in_vote:
                # Ignore appointments and confirmations
                if "The Senate advises and consents to the appointment" \
                        in line:
                    in_vote = False
                    cur_vote = None
                    results = {}
                    cur_motion = ""

                # If votes are being processed, record the voting members
                elif ":" in line:
                    cur_vote, who = (x.strip() for x in line.split(":", 1))
                    who = [x.strip() for x in who.split(';')
                           if x.strip() != ""]
                    results[cur_vote] = who

                    # a trailing ";" means the last name is complete
                    name_may_be_continued = False if line.endswith(";") \
                        else True

                # continuation of the current name list
                elif cur_vote is not None and \
                        not any(x in line.lower() for x in
                                ['passed', 'adopted', 'sustained',
                                 'prevailed', 'lost', 'failed']):
                    who = [x.strip() for x in line.split(";")
                           if x.strip() != ""]

                    if name_may_be_continued:
                        results[cur_vote][-1] = results[cur_vote][-1] + \
                            " " + who.pop(0)

                    name_may_be_continued = False if line.endswith(";") \
                        else True

                    results[cur_vote].extend(who)

                # At the conclusion of a vote, save its data
                elif any(x in line.lower() for x in
                         ['passed', 'adopted', 'sustained',
                          'prevailed', 'lost', 'failed']):
                    in_vote = False
                    cur_vote = None

                    # Identify what is being voted on
                    # Throw a warning if impropper informaiton found
                    bills = re.findall(r"(?i)(H|S|J)(C?)(B|R|M) (\d+)",
                                       line)

                    if bills == [] or cur_motion.strip() == "":
                        # BUG FIX: warn *before* wiping the state so the
                        # log message shows the motion that was dropped
                        # (it previously always printed an empty string)
                        self.warning(
                            "No motion or bill name found: " +
                            "motion name: " + cur_motion + "; " +
                            "decision text: " + line.strip()
                        )
                        results = {}
                        cur_motion = ""
                        continue

                    cur_bill_id = "%s%s%s %s" % (bills[-1])

                    # If votes are found in the motion name, throw an error
                    if "YEAS:" in cur_motion or "NAYS:" in cur_motion:
                        raise AssertionError(
                            "Vote data found in motion name: " +
                            cur_motion
                        )

                    # Use the collected results to determine who voted how
                    keys = {
                        "YEAS": "yes",
                        "NAYS": "no",
                        "ABSENT AND NOT VOTING": "other"
                    }
                    res = {}
                    for key in keys:
                        if key in results:
                            res[keys[key]] = filter(lambda a: a != "",
                                                    results[key])
                        else:
                            res[keys[key]] = []

                    # Count the number of members voting each way
                    yes, no, other = \
                        len(res['yes']), \
                        len(res['no']), \
                        len(res['other'])

                    chambers = {
                        "H": "lower",
                        "S": "upper",
                        "J": "joint"
                    }

                    # Identify the source chamber for the bill
                    try:
                        bc = chambers[cur_bill_id[0]]
                    except KeyError:
                        bc = 'other'

                    # Determine whether or not the vote passed
                    if "over the governor's veto" in cur_motion.lower():
                        # BUG FIX: `2 / 3` was integer division (== 0 on
                        # Python 2) and `yes / (yes + no)` likewise, so
                        # the override test degenerated to `0 > 0`; use
                        # float arithmetic for the supermajority check
                        VETO_SUPERMAJORITY = 2.0 / 3
                        passed = (yes / float(yes + no) >
                                  VETO_SUPERMAJORITY)
                    else:
                        passed = (yes > no)

                    # Create a Vote object based on the scraped information
                    vote = Vote(chamber, cur_date, cur_motion, passed,
                                yes, no, other, session=session,
                                bill_id=cur_bill_id, bill_chamber=bc)

                    vote.add_source(pdf_url)
                    vote.add_source(url)

                    # For each category of voting members,
                    # add the individuals to the Vote object
                    for key in res:
                        obj = getattr(vote, key)
                        for person in res[key]:
                            obj(person)

                    # Check the vote counts in the motion text against
                    # the parsed results
                    for category_name in keys.keys():
                        # Need to search for the singular, not plural, in the text
                        # so it can find, for example, " 1 NAY "
                        vote_re = r"(\d+)\s{}".format(category_name[:-1])
                        motion_count = int(
                            re.findall(vote_re, cur_motion)[0])

                        vote_count = vote[keys[category_name] + "_count"]
                        if motion_count != vote_count:
                            self.warning(
                                "Motion text vote counts ({}) ".format(motion_count) +
                                "differed from roll call counts ({}) ".format(vote_count) +
                                "for {0} on {1}".format(category_name, cur_bill_id)
                            )
                            vote[keys[category_name] + "_count"] = \
                                motion_count

                    self.save_vote(vote)

                    # With the vote successfully processed,
                    # wipe its data and continue to the next one
                    results = {}
                    cur_motion = ""
def asvote(self):
    """Materialize this parsed record as a billy Vote object."""
    vote = Vote(**self.asdict())
    # attach the three roll lists produced by the accessor methods
    for roll in ('yes_votes', 'no_votes', 'other_votes'):
        vote[roll] = getattr(self, roll)()
    vote.add_source(self.url)
    return vote
def parse_html_vote(self, bill, actor, date, motion, url, uniqid):
    """Parse an HTML roll-call page and attach a Vote to *bill*.

    Voice votes are skipped; committee votes are delegated to
    scrape_committee_vote.  Raises NotImplementedError when the page's
    description gives no pass/fail outcome.
    """
    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)
        descr = page.xpath("//b")[0].text_content()
        if "on voice vote" in descr:
            # voice votes carry no roll call to record
            return
        if "committee" in descr.lower():
            return self.scrape_committee_vote(bill, actor, date, motion,
                                              url, uniqid)
        passed = None
        if "Passed" in descr:
            passed = True
        elif "Failed" in descr:
            passed = False
        elif "UTAH STATE LEGISLATURE" in descr:
            return
        else:
            logger.warning(descr)
            # BUG FIX: `raise NotImplemented(...)` raised a TypeError
            # (NotImplemented is a sentinel value, not an exception class);
            # use the real exception type
            raise NotImplementedError("Can't see if we passed or failed")

        # each <b> heading like "Yeas - 40" is followed by a name table
        headings = page.xpath("//b")[1:]
        votes = page.xpath("//table")
        sets = zip(headings, votes)
        vdict = {}
        for (typ, votes) in sets:
            txt = typ.text_content()
            arr = [x.strip() for x in txt.split("-", 1)]
            if len(arr) != 2:
                continue
            v_txt, count = arr
            v_txt = v_txt.strip()
            count = int(count)
            people = [
                x.text_content().strip()
                for x in votes.xpath(".//font[@face='Arial']")
            ]
            vdict[v_txt] = {"count": count, "people": people}

        vote = Vote(actor, date, motion, passed,
                    vdict['Yeas']['count'],
                    vdict['Nays']['count'],
                    vdict['Absent or not voting']['count'],
                    _vote_id=uniqid)
        vote.add_source(url)

        for person in vdict['Yeas']['people']:
            vote.yes(person)
        for person in vdict['Nays']['people']:
            vote.no(person)
        for person in vdict['Absent or not voting']['people']:
            vote.other(person)

        logger.info(vote)
        bill.add_vote(vote)
def scrape_house_vote(self, bill, url):
    """Parse a House roll-call PDF and attach a Vote to *bill*.

    Counts come from "VOTING YEA/NAY/EXCUSED/NOT VOTING:" headers; member
    names are read from fixed-width columns below each header.
    """
    (path, resp) = self.urlretrieve(url)
    text = convert_pdf(path, 'text')
    os.remove(path)

    lines = text.split('\n')
    try:
        date = re.search(r'\d\d-\d\d-\d\d', text).group(0)
    except AttributeError:
        # no date anywhere in the document; give up on this vote
        self.log("Couldn't find date on %s" % url)
        return
    date = datetime.datetime.strptime(date, "%m-%d-%y")

    votes = []
    yes_count, no_count, other_count = None, None, 0
    vtype = None
    # header block ends around line 14; everything after is counts + names
    for line in lines[14:]:
        line = line.strip()
        if not line:
            continue
        if line.startswith('VOTING YEA'):
            yes_count = parse_vote_count(line.split(":")[1].strip())
            vtype = 'yes'
        elif line.startswith('VOTING NAY'):
            no_count = parse_vote_count(line.split(":")[1].strip())
            vtype = 'no'
        elif line.startswith('EXCUSED'):
            other_count += parse_vote_count(line.split(":")[1].strip())
            vtype = 'other'
        elif line.startswith('NOT VOTING'):
            other_count += parse_vote_count(line.split(":")[1].strip())
            vtype = 'other'
        else:
            # up to four names per line in fixed-width columns; each is
            # tagged with the most recent header's vote type
            n1 = line[0:19].strip()
            if n1:
                votes.append((n1, vtype))
            n2 = line[19:40].strip()
            if n2:
                votes.append((n2, vtype))
            n3 = line[40:58].strip()
            if n3:
                votes.append((n3, vtype))
            n4 = line[58:].strip()
            if n4:
                votes.append((n4, vtype))

    result_types = {
        'FAILED': False, 'DEFEATED': False, 'PREVAILED': True,
        'PASSED': True, 'SUSTAINED': True
    }
    passed = re.search(
        r'Roll\s+Call\s+\d+:\s+(%s)' % '|'.join(result_types.keys()),
        text).group(1)
    passed = result_types[passed]

    # motion text sits a fixed 7 lines below the "MEETING DAY" header;
    # NOTE(review): if no header is found, motion_line stays None and
    # lines[None] raises TypeError -- assumes the header always appears
    motion_line = None
    for i, line in enumerate(lines):
        if line.startswith('MEETING DAY'):
            motion_line = i + 7
    motion = re.split(r'\s{2,}', lines[motion_line].strip())[0].strip()
    if not motion:
        self.log("Couldn't find motion for %s" % url)
        return

    vote = Vote('lower', date, motion, passed, yes_count, no_count,
                other_count)
    vote.add_source(url)

    insert_specific_votes(vote, votes)
    check_vote_counts(vote)

    bill.add_vote(vote)
def scrape_uppper_committee_vote(self, bill, date, url):
    """Parse an upper-chamber committee vote PDF and attach it to *bill*.

    Votes are laid out in Yea/Nay column pairs; a member's vote is
    inferred from the column position of their "X" (or VA/VC) mark.
    """
    (path, resp) = self.urlretrieve(url)
    text = convert_pdf(path, 'text')
    lines = text.split("\n")
    os.remove(path)

    (_, motion) = lines[5].split("FINAL ACTION:")
    motion = motion.strip()
    if not motion:
        self.warning("Vote appears to be empty")
        return

    # the header row containing repeated "Yea  Nay" column labels
    vote_top_row = [
        lines.index(x) for x in lines if
        re.search(r'^\s+Yea\s+Nay.*?(?:\s+Yea\s+Nay)+$', x)
    ][0]
    yea_columns_end = lines[vote_top_row].index("Yea") + len("Yea")
    nay_columns_begin = lines[vote_top_row].index("Nay")

    votes = {'yes': [], 'no': [], 'other': []}
    for line in lines[(vote_top_row + 1):]:
        if line.strip():
            member = re.search(
                r'''(?x)
                ^\s+(?:[A-Z\-]+)?\s+ # Possible vote indicator
                ([A-Z][a-z]+ # Name must have lower-case characters
                [\w\-\s]+) # Continue looking for the rest of the name
                (?:,[A-Z\s]+?)? # Leadership has an all-caps title
                (?:\s{2,}.*)? # Name ends when many spaces are seen
                ''', line).group(1)
            # Usually non-voting members won't even have a code listed
            # Only a couple of codes indicate an actual vote:
            # "VA" (vote after roll call) and "VC" (vote change)
            did_vote = bool(re.search(r'^\s+(X|VA|VC)\s+[A-Z][a-z]', line))
            if did_vote:
                # Check where the "X" or vote code is on the page
                vote_column = len(line) - len(line.lstrip())
                if vote_column <= yea_columns_end:
                    votes['yes'].append(member)
                elif vote_column >= nay_columns_begin:
                    votes['no'].append(member)
                else:
                    # mark falls between the Yea and Nay columns
                    raise AssertionError(
                        "Unparseable vote found for {0} in {1}:\n{2}".
                        format(member, url, line))
            else:
                votes['other'].append(member)

        # End loop as soon as no more members are found
        else:
            break

    totals = re.search(r'(?msu)\s+(\d{1,3})\s+(\d{1,3})\s+.*?TOTALS',
                       text).groups()
    yes_count = int(totals[0])
    no_count = int(totals[1])
    passed = (yes_count > no_count)
    other_count = len(votes['other'])

    vote = Vote('upper', date, motion, passed, yes_count, no_count,
                other_count)
    vote.add_source(url)
    vote['yes_votes'] = votes['yes']
    vote['no_votes'] = votes['no']
    vote['other_votes'] = votes['other']

    vote.validate()
    bill.add_vote(vote)
def scrape_journal(self, session, url):
    """Scrape votes out of one legislative journal PDF.

    Runs a state machine over the converted text: a question (motion) is
    captured between quotes, a Vote is opened when the question ends, and
    subsequent YEAS/NAYS/NOT VOTING sections fill in counts and names
    until a passage/failure line closes and saves the vote.
    """
    journal, resp = self.urlretrieve(url)
    text = convert_pdf(journal, type='text')
    lines = text.splitlines()

    # state machine:
    # None - undefined state
    # question_quote - in question, looking for end quote
    # pre-yes - vote is active, haven't hit yes votes yet
    # yes - yes votes
    # no - no votes
    # other - other votes
    state = None
    vote = None

    for line_num, line in enumerate(lines):
        date_match = DATE_RE.findall(line)

        # skip headers
        if 'LEGISLATIVE JOURNAL' in line:
            continue
        elif date_match:
            # remember the most recent journal date for subsequent votes
            date = datetime.datetime.strptime(' '.join(date_match[0]),
                                              '%B %d %Y')
            continue
        # keep adding lines to question while quotes are open
        elif state == 'question_quote':
            question += ' %s' % line
        elif state in ('pre-yes', 'yes', 'no', 'other'):
            yes_match = YES_RE.match(line)
            no_match = NO_RE.match(line)
            other_match = NOT_VOTING_RE.match(line)
            if yes_match:
                vote['yes_count'] = int(yes_match.group(1))
                state = 'yes'
            elif no_match:
                vote['no_count'] = int(no_match.group(1))
                state = 'no'
            elif other_match:
                # accumulated: multiple "not voting"-style sections can occur
                vote['other_count'] += int(other_match.group(1))
                state = 'other'
            elif 'having voted in the affirmative' in line:
                # vote concluded successfully; save and reset
                vote['passed'] = True
                state = None
                vote.validate()
                self.save_vote(vote)
                vote = None
            elif 'Having failed' in line:
                vote['passed'] = False
                state = None
                vote.validate()
                self.save_vote(vote)
                vote = None
            elif line:
                # a name-list line: names are separated by 3+ spaces
                people = re.split('\s{3,}', line)
                #try:
                func = {
                    'yes': vote.yes,
                    'no': vote.no,
                    'other': vote.other
                }[state]
                #except KeyError:
                #self.warning('line showed up in pre-yes state: %s',
                #             line)
                for p in people:
                    if p:
                        # special case for long name w/ 1 space
                        if p.startswith(('Lautenbaugh ', 'Langemeier ')):
                            p1, p2 = p.split(' ', 1)
                            func(p1)
                            func(p2)
                        else:
                            func(p)

        # check the text against our regexes
        bill_match = BILL_RE.match(line)
        veto_match = VETO_BILL_RE.findall(line)
        question_match = QUESTION_RE.findall(line)
        if bill_match:
            bill_type, bill_id = bill_match.groups()
            if bill_type == 'BILL':
                bill_id = 'LB ' + bill_id
            elif bill_type == 'RESOLUTION':
                bill_id = 'LR ' + bill_id
        elif question_match:
            question = question_match[0]
            state = 'question_quote'
        elif veto_match:
            bill_id = veto_match[0]

        # line just finished a question
        if state == 'question_quote' and QUESTION_MATCH_END in question:
            question = re.sub(
                '\s+', ' ',
                question.replace(QUESTION_MATCH_END, '').strip())
            # save prior vote
            vote = Vote(bill_id=bill_id, session=session,
                        bill_chamber='upper', chamber='upper',
                        motion=question, type='passage', passed=False,
                        date=date, yes_count=0, no_count=0,
                        other_count=0)
            vote.add_source(url)
            state = 'pre-yes'
            # reset bill_id and question
            bill_id = question = None
def parse_vote(self, bill, vote_date, vote_chamber, vote_status, vote_url):
    """Download a vote document, convert it via abiword, and parse it.

    The document is converted to plain text and scanned line by line for
    a totals line ("Yeas n; Nays n; ...") followed by the Yeas/Nays/
    Present/Absent name lists.  Attaches the Vote to *bill*.
    """
    vote_chamber = 'upper' if vote_chamber == 'Senate' else 'lower'
    # the date string appears in several formats across documents
    formats = ['%a %d %b %Y',
               '%b. %d, %Y, %H:%M %p',
               '%B %d, %Y, %H:%M %p',
               '%B %d, %Y, %H %p',
               '%a, %b %d, %Y'
               ]
    # normalize "a.m."/"p.m." to "am"/"pm" so %p can match
    vote_date = vote_date.replace('.m.', 'm')
    for format in formats:
        try:
            vote_date = datetime.datetime.strptime(vote_date, format)
            break
        except ValueError:
            pass
    else:
        raise ValueError("couldn't parse date: " + vote_date)

    vote_doc, resp = self.urlretrieve(vote_url)

    # NOTE(review): shell string interpolation of a local temp path;
    # also only one conversion can safely run at a time since the output
    # filename /tmp/ksvote.txt is fixed -- verify callers are serial
    try:
        subprocess.check_call('timeout 10 abiword --to=ksvote.txt %s' %
                              vote_doc, shell=True, cwd='/tmp/')
    except subprocess.CalledProcessError:
        # timeout failed, some documents hang abiword
        self.error('abiword hung for longer than 10s on conversion')
        return
    # NOTE(review): file handle is never closed explicitly
    vote_lines = open('/tmp/ksvote.txt').readlines()
    os.remove(vote_doc)

    comma_or_and = re.compile(', |\sand\s')
    comma_or_and_jrsr = re.compile(', (?!Sr.|Jr.)|\sand\s')

    vote = None
    passed = True
    for line in vote_lines:
        totals = re.findall(
            'Yeas (\d+)[;,] Nays (\d+)[;,] (?:Present but not voting|Present and Passing):? (\d+)[;,] (?:Absent or not voting|Absent or Not Voting):? (\d+)',
            line)
        line = line.strip()
        if totals:
            totals = totals[0]
            yeas = int(totals[0])
            nays = int(totals[1])
            nv = int(totals[2])
            absent = int(totals[3])
            # default passed to true
            vote = Vote(vote_chamber, vote_date, vote_status.strip(),
                        True, yeas, nays, nv + absent)
        elif vote and line.startswith('Yeas:'):
            line = line.split(':', 1)[1].strip()
            for member in comma_or_and.split(line):
                if member != 'None.':
                    vote.yes(member)
        elif vote and line.startswith('Nays:'):
            line = line.split(':', 1)[1].strip()
            # slightly different vote format if Jr stands alone on a line
            if ', Jr.,' in line:
                regex = comma_or_and_jrsr
            else:
                regex = comma_or_and
            for member in regex.split(line):
                if member != 'None.':
                    vote.no(member)
        elif vote and line.startswith('Present '):
            line = line.split(':', 1)[1].strip()
            for member in comma_or_and.split(line):
                if member != 'None.':
                    vote.other(member)
        elif vote and line.startswith('Absent or'):
            line = line.split(':', 1)[1].strip()
            for member in comma_or_and.split(line):
                if member != 'None.':
                    vote.other(member)
        elif 'the motion did not prevail' in line:
            passed = False

    if vote:
        vote['passed'] = passed
        vote.add_source(vote_url)
        bill.add_vote(vote)
def scrape_floor_vote(self, chamber, bill, date, url):
    """Parse a floor-vote PDF and attach a Vote to *bill*.

    The PDF layout is positional: the motion, totals, and vote lines sit
    at known line offsets, which are shifted when the motion starts a
    line earlier or spills onto extra lines.
    """
    (path, resp) = self.urlretrieve(url)
    text = convert_pdf(path, 'text')
    lines = text.split("\n")
    os.remove(path)

    MOTION_INDEX = 4
    TOTALS_INDEX = 6
    VOTE_START_INDEX = 9

    motion = lines[MOTION_INDEX].strip()
    # Sometimes there is no motion name, only "Passage" in the line above
    if (not motion and
            not lines[MOTION_INDEX - 1].startswith("Calendar Page:")):
        motion = lines[MOTION_INDEX - 1]
        MOTION_INDEX -= 1
        TOTALS_INDEX -= 1
        VOTE_START_INDEX -= 1
    else:
        assert motion, "Floor vote's motion name appears to be empty"

    # long motions wrap onto up to two extra lines; append them and shift
    # the downstream offsets accordingly
    for _extra_motion_line in range(2):
        MOTION_INDEX += 1
        if lines[MOTION_INDEX].strip():
            motion = "{}, {}".format(motion, lines[MOTION_INDEX].strip())
            TOTALS_INDEX += 1
            VOTE_START_INDEX += 1
        else:
            break

    (yes_count, no_count, other_count) = [
        int(x) for x in re.search(
            r'^\s+Yeas - (\d+)\s+Nays - (\d+)\s+Not Voting - (\d+)\s*$',
            lines[TOTALS_INDEX]).groups()
    ]
    passed = (yes_count > no_count)

    vote = Vote(chamber, date, motion, passed, yes_count, no_count,
                other_count)
    vote.add_source(url)

    for line in lines[VOTE_START_INDEX:]:
        if not line.strip():
            break

        # drop leadership titles so the name regexes line up
        if " President " in line:
            line = line.replace(" President ", " ")
        elif " Speaker " in line:
            line = line.replace(" Speaker ", " ")

        # Votes follow the pattern of:
        # [vote code] [member name]-[district number]
        for member in re.findall(r'\s*Y\s+(.*?)-\d{1,3}\s*', line):
            vote.yes(member)
        for member in re.findall(r'\s*N\s+(.*?)-\d{1,3}\s*', line):
            vote.no(member)
        for member in re.findall(r'\s*(?:EX|AV)\s+(.*?)-\d{1,3}\s*', line):
            vote.other(member)

    try:
        vote.validate()
    except ValueError:
        # On a rare occasion, a member won't have a vote code,
        # which indicates that they didn't vote. The totals reflect
        # this.
        self.logger.info("Votes don't add up; looking for additional ones")
        for line in lines[VOTE_START_INDEX:]:
            if not line.strip():
                break
            for member in re.findall(r'\s{8,}([A-Z][a-z\'].*?)-\d{1,3}',
                                     line):
                vote.other(member)
        vote.validate()
    bill.add_vote(vote)
def scrape_lower_committee_votes(self, session_number, bill):
    '''
    House committee roll calls are not available on the Senate's
    website. Furthermore, the House uses an internal ID system in
    its URLs, making accessing those pages non-trivial.

    This function will fetch all the House committee votes for the
    given bill, and add the votes to that object.
    '''
    house_url = 'http://www.myfloridahouse.gov/Sections/Bills/bills.aspx'

    # Keep the digits and all following characters in the bill's ID
    bill_number = re.search(r'^\w+\s(\d+\w*)$', bill['bill_id']).group(1)

    # Search form posted to the House site to locate the bill's detail page.
    form = {
        'rblChamber': 'B',
        'ddlSession': session_number,
        'ddlBillList': '-1',
        'txtBillNumber': bill_number,
        'ddlSponsor': '-1',
        'ddlReferredTo': '-1',
        'SubmittedByControl': '',
    }
    doc = lxml.html.fromstring(self.post(url=house_url, data=form).text)
    doc.make_links_absolute(house_url)
    # Single-element unpack asserts exactly one matching bill link.
    (bill_link, ) = doc.xpath(
        '//a[contains(@href, "/Bills/billsdetail.aspx?BillId=")]/@href')

    bill_doc = self.lxmlize(bill_link)
    links = bill_doc.xpath('//a[text()="See Votes"]/@href')

    for link in links:
        vote_doc = self.lxmlize(link)
        (date, ) = vote_doc.xpath(
            '//span[@id="ctl00_ContentPlaceHolder1_lblDate"]/text()')
        date = datetime.datetime.strptime(
            date, '%m/%d/%Y %I:%M:%S %p').date()

        # Totals live in the innermost nested table; collapse whitespace
        # so one regex can pick out all three counts.
        totals = vote_doc.xpath('//table//table')[-1].text_content()
        totals = re.sub(r'(?mu)\s+', " ", totals).strip()
        (yes_count, no_count, other_count) = [
            int(x) for x in re.search(
                r'(?m)Total Yeas:\s+(\d+)\s+Total Nays:\s+(\d+)\s+'
                'Total Missed:\s+(\d+)', totals).groups()
        ]
        passed = yes_count > no_count

        (committee, ) = vote_doc.xpath(
            '//span[@id="ctl00_ContentPlaceHolder1_lblCommittee"]/text()')
        (action, ) = vote_doc.xpath(
            '//span[@id="ctl00_ContentPlaceHolder1_lblAction"]/text()')
        motion = "{} ({})".format(action, committee)

        vote = Vote('lower', date, motion, passed, yes_count, no_count,
                    other_count)
        vote.add_source(link)

        for member_vote in vote_doc.xpath('//table//table//table//td'):
            # Skip empty layout cells.
            if not member_vote.text_content().strip():
                continue

            # span[2] holds the member name, span[1] the vote code.
            (member, ) = member_vote.xpath('span[2]//text()')
            (member_vote, ) = member_vote.xpath('span[1]//text()')

            if member_vote == "Y":
                vote.yes(member)
            elif member_vote == "N":
                vote.no(member)
            elif member_vote == "-":
                vote.other(member)
            # Parenthetical votes appear to not be counted in the
            # totals for Yea, Nay, _or_ Missed
            elif re.search(r'\([YN]\)', member_vote):
                continue
            else:
                raise IndexError(
                    "Unknown vote type found: {}".format(member_vote))

        vote.validate()
        bill.add_vote(vote)
def parse_html_vote(self, bill, actor, date, motion, url, uniqid):
    """Parse an HTML floor-vote page and attach the Vote to *bill*.

    Delegates committee votes to scrape_committee_vote(); returns early
    for voice votes and for pages that carry no usable result.
    """
    try:
        page = self.get(url).text
    except scrapelib.HTTPError:
        self.warning("A vote page not found for bill {}".format(
            bill['bill_id']))
        return
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)
    descr = page.xpath("//b")[0].text_content()
    if descr == '':
        #New page method
        descr = page.xpath("//div[@id='content']/center")[0].text

    if "on voice vote" in descr:
        return

    if "committee" in descr.lower():
        return self.scrape_committee_vote(bill, actor, date, motion,
                                          page, url, uniqid)

    passed = None
    if "Passed" in descr:
        passed = True
    elif "Failed" in descr:
        passed = False
    elif "UTAH STATE LEGISLATURE" in descr:
        # Header-only page with no result — nothing to record.
        return
    else:
        self.warning(descr)
        raise NotImplementedError("Can't see if we passed or failed")

    # Each <b> heading ("Yeas - N" etc.) pairs with the following table
    # of names in document order.
    headings = page.xpath("//b")[1:]
    votes = page.xpath("//table")
    sets = zip(headings, votes)
    vdict = {}
    for (typ, votes) in sets:
        txt = typ.text_content()
        arr = [x.strip() for x in txt.split("-", 1)]
        if len(arr) != 2:
            continue
        v_txt, count = arr
        v_txt = v_txt.strip()
        count = int(count)
        people = [
            x.text_content().strip()
            for x in votes.xpath(".//font[@face='Arial']")
        ]

        vdict[v_txt] = {"count": count, "people": people}

    # KeyError here means the page lacked one of the expected headings.
    vote = Vote(actor, date, motion, passed,
                vdict['Yeas']['count'], vdict['Nays']['count'],
                vdict['Absent or not voting']['count'],
                _vote_id=uniqid)
    vote.add_source(url)

    for person in vdict['Yeas']['people']:
        vote.yes(person)
    for person in vdict['Nays']['people']:
        vote.no(person)
    for person in vdict['Absent or not voting']['people']:
        vote.other(person)

    self.info("Adding vote to bill")
    bill.add_vote(vote)
def scrape_digest(self, bill):
    """Scrape a Wyoming bill-digest page: extended title, sponsors,
    actions, and roll-call votes, all added onto *bill*.

    The digest is a flat sequence of <p> paragraphs, so the action/vote
    parsing below is a line-oriented pseudo state machine.
    """
    digest_url = 'http://legisweb.state.wy.us/%(session)s/Digest/%(bill_id)s.htm' % bill
    bill.add_source(digest_url)
    try:
        html = self.urlopen(digest_url)
    except scrapelib.HTTPError:
        self.warning('no digest for %s' % bill['bill_id'])
        return
    doc = lxml.html.fromstring(html)

    ext_title = doc.xpath('//span[@class="billtitle"]')
    if ext_title:
        bill['extended_title'] = ext_title[0].text_content().replace(
            '\r\n', ' ')

    sponsor_span = doc.xpath('//span[@class="sponsors"]')
    sponsors = ''
    if sponsor_span:
        sponsors = sponsor_span[0].text_content().replace('\r\n', ' ')
    else:
        # Older pages have no sponsors span; fall back to scanning <p>s.
        for p in doc.xpath('//p'):
            if p.text_content().lower().startswith('sponsored by'):
                sponsors = p.text_content().replace('\r\n', ' ')
    if sponsors:
        if 'Committee' in sponsors:
            # Committee sponsorship is recorded as a single sponsor string.
            bill.add_sponsor('sponsor', sponsors)
        else:
            # The list mixes chambers; split on the other chamber's label.
            if bill['chamber'] == 'lower':
                sp_lists = sponsors.split('and Senator(s)')
            else:
                sp_lists = sponsors.split('and Representative(s)')
            for spl in sp_lists:
                for sponsor in split_names(spl):
                    bill.add_sponsor('sponsor', sponsor)

    # e.g. "1/13/2010 H Introduced..." — optional "H "/"S " actor marker.
    action_re = re.compile('(\d{1,2}/\d{1,2}/\d{4})\s+(H |S )?(.+)')
    vote_total_re = re.compile(
        '(Ayes )?(\d*)(\s*)Nays(\s*)(\d+)(\s*)Excused(\s*)(\d+)(\s*)Absent(\s*)(\d+)(\s*)Conflicts(\s*)(\d+)'
    )

    # NOTE(review): the first list comprehension is immediately overwritten
    # by the second — the '//*[@class="actions"]' result is discarded.
    actions = [
        x.text_content() for x in doc.xpath('//*[@class="actions"]')
    ]
    actions = [x.text_content() for x in doc.xpath('//p')]
    # Drop everything before the first dated action line (the page header).
    thing = []
    pastHeader = False
    for action in actions:
        if not pastHeader and action_re.match(action):
            pastHeader = True
        if pastHeader:
            thing.append(action)
    actions = thing

    # initial actor is bill chamber
    actor = bill['chamber']

    aiter = iter(actions)
    for line in aiter:
        line = clean_line(line)

        # skip blank lines
        if not line:
            continue

        amatch = action_re.match(line)
        if amatch:
            date, achamber, action = amatch.groups()

            # change actor if one is on this action
            if achamber == 'H ':
                actor = 'lower'
            elif achamber == 'S ':
                actor = 'upper'

            date = datetime.datetime.strptime(date, '%m/%d/%Y')
            bill.add_action(actor, action, date,
                            type=categorize_action(action))
        elif line == 'ROLL CALL':
            voters = {}
            # if we hit a roll call, use an inner loop to consume lines
            # in a psuedo-state machine manner, 3 types
            # Ayes|Nays|Excused|... - indicates next line is voters
            # : (Senators|Representatives): ... - voters
            # \d+ Nays \d+ Excused ... - totals
            while True:
                nextline = clean_line(aiter.next())
                if not nextline:
                    continue

                breakers = [
                    "Ayes:", "Nays:", "Nayes:", "Excused:",
                    "Absent:", "Conflicts:"
                ]

                for breaker in breakers:
                    if nextline.startswith(breaker):
                        voters_type = breaker[:-1]
                        if voters_type == "Nayes":
                            # Source data occasionally misspells "Nays".
                            voters_type = "Nays"
                            self.log("Fixed a case of 'Naye-itis'")
                        # Keep the leading ':' so the branch below fires.
                        nextline = nextline[len(breaker) - 1:]

                if nextline.startswith(': '):
                    voters[voters_type] = nextline
                elif nextline in ('Ayes', 'Nays', 'Excused', 'Absent',
                                  'Conflicts'):
                    voters_type = nextline
                elif vote_total_re.match(nextline):
                    # Totals line terminates the roll call.
                    #_, ayes, _, nays, _, exc, _, abs, _, con, _ = \
                    tupple = vote_total_re.match(nextline).groups()
                    ayes = tupple[1]
                    nays = tupple[4]
                    exc = tupple[7]
                    abs = tupple[10]  # NOTE: shadows the abs() builtin
                    con = tupple[13]

                    # 'action' and 'date' still hold the most recent
                    # dated action preceding this ROLL CALL paragraph.
                    passed = (('Passed' in action or
                               'Do Pass' in action or
                               'Did Concur' in action) and
                              'Failed' not in action)
                    vote = Vote(actor, date, action, passed,
                                int(ayes), int(nays),
                                int(exc) + int(abs) + int(con))

                    # NOTE: loop target rebinds 'voters' (the dict) to each
                    # value; safe only because the iterator was created
                    # from the dict before the first rebinding.
                    for vtype, voters in voters.iteritems():
                        for voter in split_names(voters):
                            if vtype == 'Ayes':
                                vote.yes(voter)
                            elif vtype == 'Nays':
                                vote.no(voter)
                            else:
                                vote.other(voter)
                    # done collecting this vote
                    bill.add_vote(vote)
                    break
def scrape_journal(self, url, chamber, session, date):
    """Scrape one chamber-journal PDF, extracting and saving every
    roll-call vote found in it.

    The PDF is converted to XML, then walked line-by-line: a line
    containing a "Shall ..." question starts a vote; subsequent lines
    are consumed until the bill id appears in parentheses, after which
    parse_votes() reads the tallies.
    """
    filename, response = self.urlretrieve(url)
    self.logger.info('Saved journal to %r' % filename)
    xml = convert_pdf(filename)
    try:
        et = lxml.etree.fromstring(xml)
    except lxml.etree.XMLSyntaxError:
        self.logger.warning('Skipping invalid pdf: %r' % filename)
        return

    lines = self._journal_lines(et)
    while True:
        try:
            line = next(lines)
        except StopIteration:
            break

        text = gettext(line)

        # Go through with vote parse if any of
        # these conditions match.
        if 'Shall' in text:
            if 'bill pass?' in text:
                pass
            elif 'resolution' in text:
                pass
            elif 'amendment' in text:
                pass
            else:
                continue
        else:
            continue

        # Get the bill_id. Accumulate lines until "( HF 123 )" appears.
        while True:
            line = next(lines)
            text += gettext(line)
            m = re.search(r'\(\s*([A-Z\.]+\s+\d+)\s*\)', text)
            if m:
                bill_id = m.group(1)
                break

        # Clean up the motion text: drop the trailing "(bill id)" part
        # and stray typographic quotes/spacing.
        motion = text.strip()
        motion = re.sub(r'\s+', ' ', motion)
        # BUG FIX: rsplit('(') splits on EVERY '(' — unpacking into two
        # names raised ValueError whenever the motion itself contained a
        # parenthesis. Split only on the last one.
        motion, _ = motion.rsplit('(', 1)
        motion = motion.replace('"', '')
        motion = motion.replace(u'“', '')
        motion = motion.replace(u'\u201d', '')
        motion = motion.replace(u' ,', ',')
        motion = motion.strip()
        motion = re.sub(r'[SH].\d+', lambda m: ' %s ' % m.group(), motion)
        motion = re.sub(r'On the question\s*', '', motion, flags=re.I)

        # Normalize "Senate File 123" -> "SF 123" style ids.
        for word, letter in (('Senate', 'S'),
                             ('House', 'H'),
                             ('File', 'F')):
            bill_id = bill_id.replace(word, letter)
        bill_chamber = dict(h='lower', s='upper')[bill_id.lower()[0]]
        self.current_id = bill_id

        votes = self.parse_votes(lines)
        # Passed iff yes votes are at least half of all counted votes.
        totals = filter(lambda x: isinstance(x, int), votes.values())
        passed = (1.0 * votes['yes_count'] / sum(totals)) >= 0.5
        vote = Vote(motion=motion,
                    passed=passed,
                    chamber=chamber,
                    date=date,
                    session=session,
                    bill_id=bill_id,
                    bill_chamber=bill_chamber,
                    **votes)
        vote.update(votes)
        vote.add_source(url)
        self.save_vote(vote)
def scrape_vote(self, bill, name, url):
    """Scrape a Louisiana vote PDF named like "Senate Vote on ..., MOTION"
    and attach the parsed Vote to *bill*.
    """
    match = re.match('^(Senate|House) Vote on [^,]*,(.*)$', name)
    if not match:
        return

    chamber = {'Senate': 'upper', 'House': 'lower'}[match.group(1)]
    motion = match.group(2).strip()

    # Classify the motion.  NOTE(review): 'READINT' looks like a typo for
    # 'READING' — but it may deliberately match a typo in the source data;
    # confirm against actual vote names before changing.
    if motion.startswith('FINAL PASSAGE'):
        type = 'passage'
    elif motion.startswith('AMENDMENT'):
        type = 'amendment'
    elif 'ON 3RD READINT' in motion:
        type = 'reading:3'
    else:
        type = 'other'

    # Counts/date are filled in below once the PDF body is parsed.
    vote = Vote(chamber, None, motion, None, None, None, None)
    vote['type'] = type
    vote.add_source(url)

    with self.urlopen(url) as text:
        # Write the PDF to a temp file so pdf_to_lxml can read it.
        (fd, temp_path) = tempfile.mkstemp()
        with os.fdopen(fd, 'wb') as w:
            w.write(text)
        html = pdf_to_lxml(temp_path)
        os.remove(temp_path)

        vote_type = None
        total_re = re.compile('^Total--(\d+)$')
        body = html.xpath('string(/html/body)')

        # The date appears as MMDD right after the bill id in the body.
        date_match = re.search('%s (\d{4,4})' % bill['bill_id'], body)
        date = date_match.group(1)
        month = int(date[0:2])
        day = int(date[2:4])
        date = datetime.date(int(bill['session']), month, day)
        vote['date'] = date

        for line in body.replace(u'\xa0', '\n').split('\n'):
            line = line.replace(' ', '').strip()
            if not line:
                continue

            # Section headers switch which tally we are filling.
            if line in ('YEAS', 'NAYS', 'ABSENT'):
                vote_type = {'YEAS': 'yes', 'NAYS': 'no',
                             'ABSENT': 'other'}[line]
            elif vote_type:
                match = total_re.match(line)
                if match:
                    # "Total--N" line closes out the current section.
                    vote['%s_count' % vote_type] = int(match.group(1))
                elif vote_type == 'yes':
                    vote.yes(line)
                elif vote_type == 'no':
                    vote.no(line)
                elif vote_type == 'other':
                    vote.other(line)

    # The PDFs oddly don't say whether a vote passed or failed.
    # Hopefully passage just requires yes_votes > not_yes_votes
    if vote['yes_count'] > (vote['no_count'] + vote['other_count']):
        vote['passed'] = True
    else:
        vote['passed'] = False

    bill.add_vote(vote)
def scrape_votes(self, session):
    """Scrape NH roll-call votes from the two pipe-delimited files in
    self.zf (a zip archive): the summary file creates Vote objects and
    the history file fills in the per-legislator positions.
    """
    votes = {}
    last_line = []

    for line in self.zf.open('tblrollcallsummary.txt'):
        if line.strip() == "":
            continue

        line = line.split('|')
        if len(line) < 14:
            # A record was split across two physical lines.  If the saved
            # prefix plus this continuation adds up to a full 14-field
            # record, stitch them together; otherwise save this fragment
            # and wait for its continuation.
            if len(last_line + line[1:]) == 14:
                # BUG FIX: previously `line = last_line`, which parsed the
                # incomplete prefix (and crashed on missing fields) instead
                # of the reassembled record.
                line = last_line + line[1:]
                self.warning('used bad vote line')
            else:
                last_line = line
                self.warning('bad vote line %s' % '|'.join(line))
                # BUG FIX: previously fell through and tried to index
                # fields 5-11 of an incomplete record.
                continue

        session_yr = line[0]
        body = line[1]
        vote_num = line[2]
        timestamp = line[3]
        bill_id = line[4].strip()
        yeas = int(line[5])
        nays = int(line[6])
        # Fields 7/8 are present/absent tallies; kept for documentation
        # of the record layout even though only yeas/nays are used below.
        present = int(line[7])
        absent = int(line[8])
        motion = line[11].strip() or '[not available]'

        if session_yr == session and bill_id in self.bills_by_id:
            actor = 'lower' if body == 'H' else 'upper'
            time = dt.datetime.strptime(timestamp,
                                        '%m/%d/%Y %I:%M:%S %p')
            # TODO: stop faking passed somehow
            passed = yeas > nays
            vote = Vote(actor, time, motion, passed, yeas, nays,
                        other_count=0)
            votes[body + vote_num] = vote
            self.bills_by_id[bill_id].add_vote(vote)

    for line in self.zf.open('tblrollcallhistory.txt'):
        # 2012 | H | 2 | 330795 | HB309 | Yea |1/4/2012 8:27:03 PM
        session_yr, body, v_num, employee, bill_id, vote, date \
            = line.split('|')

        if not bill_id:
            continue

        if session_yr == session and bill_id.strip() in self.bills_by_id:
            try:
                leg = self.legislators[employee]['name']
            except KeyError:
                self.warning("Error, can't find person %s" % employee)
                continue

            vote = vote.strip()
            if body + v_num not in votes:
                self.warning("Skipping processing this vote:")
                self.warning("Bad ID: %s" % (body + v_num))
                continue

            #code = self.legislators[employee]['seat']
            if vote == 'Yea':
                votes[body + v_num].yes(leg)
            elif vote == 'Nay':
                votes[body + v_num].no(leg)
            else:
                votes[body + v_num].other(leg)
                # other_count starts at 0; tally as we classify.
                votes[body + v_num]['other_count'] += 1
def scrape_votes(self, url, motion, date, chamber):
    """Parse a Mississippi vote PDF into a Vote object and return it.

    Names are accumulated into yes/no/other lists by scanning for the
    section-precursor strings below; counts are derived from the list
    lengths rather than printed totals.
    """
    vote_pdf, resp = self.urlretrieve(url)
    text = convert_pdf(vote_pdf, 'text')
    os.remove(vote_pdf)

    # this way we get a key error on a missing vote type
    motion, passed = self._vote_mapping[motion]

    yes_votes = []
    no_votes = []
    other_votes = []

    # point at array to add names to
    cur_array = None

    # (marker substring, target list); None target stops collection.
    precursors = (
        ('Yeas--', yes_votes),
        ('Nays--', no_votes),
        ('Absent or those not voting--', other_votes),
        ('Absent and those not voting--', other_votes),
        ('Voting Present--', other_votes),
        ('Present--', other_votes),
        ('DISCLAIMER', None),
    )

    # split lines on newline, recombine lines that don't end in punctuation
    lines = _combine_lines(text.split('\n'))

    for line in lines:
        # check if the line starts with a precursor, switch to that array
        for pc, arr in precursors:
            if pc in line:
                cur_array = arr
                line = line.replace(pc, '')

        # split names
        for name in line.split(','):
            name = name.strip()

            # move on if that's all there was
            if not name:
                continue

            # None or a Total indicate the end of a section
            if 'None.' in name:
                cur_array = None
            match = re.match(r'(.+?)\. Total--.*', name)
            if match:
                # Final name is glued to the "Total--" tail; keep the name,
                # then stop collecting for this section.
                cur_array.append(match.groups()[0])
                cur_array = None

            # append name if it looks ok
            junk_in_name = False
            for junk in ('on final passage', 'Necessary', 'who would have',
                         'being a tie', 'therefore', 'Vacancies', 'a pair',
                         'Total-', 'ATTORNEY', 'on final passage',
                         'SPEAKER', 'BOARD', 'TREASURER', 'GOVERNOR',
                         'ARCHIVES', 'SECRETARY'):
                if junk in name:
                    junk_in_name = True
                    break
            if cur_array is not None and not junk_in_name:
                # strip trailing .
                if name[-1] == '.':
                    name = name[:-1]
                cur_array.append(name)

    # return vote object
    yes_count = len(yes_votes)
    no_count = len(no_votes)
    other_count = len(other_votes)
    vote = Vote(chamber, date, motion, passed,
                yes_count, no_count, other_count)
    vote['yes_votes'] = yes_votes
    vote['no_votes'] = no_votes
    vote['other_votes'] = other_votes
    return vote
def scrape_senate_vote(self, bill, url):
    """Parse a Senate vote PDF, build a Vote, and attach it to *bill*.

    Logs and returns early (rather than raising) when the PDF lacks a
    date, vote counts, or a recognizable motion line.
    """
    (path, resp) = self.urlretrieve(url)
    text = convert_pdf(path, 'text')
    os.remove(path)

    lines = text.split('\n')

    date_match = re.search(r'Date:\s+(\d+/\d+/\d+)', text)
    if not date_match:
        self.log("Couldn't find date on %s" % url)
        return

    time_match = re.search(r'Time:\s+(\d+:\d+:\d+)\s+(AM|PM)', text)
    date = "%s %s %s" % (date_match.group(1), time_match.group(1),
                         time_match.group(2))
    date = datetime.datetime.strptime(date, "%m/%d/%Y %I:%M:%S %p")
    date = self._tz.localize(date)

    # Walk the tally section: header lines set the counts and switch
    # which bucket subsequent name lines fall into.
    vote_type = None
    yes_count, no_count, other_count = None, None, 0
    votes = []
    for line in lines[21:]:
        line = line.strip()
        if not line:
            continue

        if line.startswith('YEAS'):
            yes_count = int(line.split(' - ')[1])
            vote_type = 'yes'
        elif line.startswith('NAYS'):
            no_count = int(line.split(' - ')[1])
            vote_type = 'no'
        elif line.startswith('EXCUSED') or line.startswith('NOT VOTING'):
            other_count += int(line.split(' - ')[1])
            vote_type = 'other'
        else:
            # Names are separated by runs of 2+ spaces.
            votes.extend([(n.strip(), vote_type)
                          for n in re.split(r'\s{2,}', line)])

    if yes_count is None or no_count is None:
        # BUG FIX: log message previously read "Couldne't".
        self.log("Couldn't find vote counts in %s" % url)
        return

    passed = yes_count > no_count + other_count

    # The motion is two lines below the line holding the bill id.
    clean_bill_id = fix_bill_id(bill['bill_id'])
    motion_line = None
    for i, line in enumerate(lines):
        if line.strip() == clean_bill_id:
            motion_line = i + 2
    # BUG FIX: if the bill id was never found, motion_line stayed None and
    # lines[None] raised TypeError instead of the intended logged skip.
    if motion_line is None:
        self.log("Couldn't find motion for %s" % url)
        return
    motion = lines[motion_line]
    if not motion:
        self.log("Couldn't find motion for %s" % url)
        return

    vote = Vote('upper', date, motion, passed, yes_count, no_count,
                other_count)
    vote.add_source(url)

    insert_specific_votes(vote, votes)
    check_vote_counts(vote)

    bill.add_vote(vote)
def scrape_bill_sheet(self, session, chamber):
    """
    Scrape the bill sheet (the page full of bills and other small bits of data)
    """
    sheet_url = self.get_bill_folder(session, chamber)

    bill_chamber = {"Senate": "upper", "House": "lower"}[chamber]

    # Column positions of interest within each bill-sheet table row.
    index = {
        "id": 0,
        "title_sponsor": 1,
        "version": 2,
        "history": 3,
        "votes": 7
    }

    sheet_html = self.urlopen(sheet_url)
    sheet_page = lxml.html.fromstring(sheet_html)

    bills = sheet_page.xpath('//table/tr')

    for bill in bills:
        bill_id = self.read_td(bill[index["id"]][0])

        if bill_id == None:
            # Every other entry is null for some reason
            continue

        dot_loc = bill_id.find('.')
        if dot_loc != -1:
            # budget bills are missing the .pdf, don't truncate
            bill_id = bill_id[:dot_loc]

        title_and_sponsor = bill[index["title_sponsor"]][0]

        bill_title = title_and_sponsor.text
        bill_title_and_sponsor = title_and_sponsor.text_content()
        if bill_title is None:
            continue  # Odd ...

        # Sponsors are whatever remains once the title is stripped off.
        sponsors = bill_title_and_sponsor.replace(bill_title, "").\
            replace(" & ...", "").split("--")

        # Map bill-id prefixes to OpenStates bill types.
        cats = {
            "SB": "bill",
            "HB": "bill",
            "HR": "resolution",
            "SR": "resolution",
            "SCR": "concurrent resolution",
            "HCR": "concurrent resolution",
            "SJR": "joint resolution",
            "HJR": "joint resolution",
            "SM": "memorial",
            "HM": "memorial"
        }

        bill_type = None

        for cat in cats:
            if bill_id[:len(cat)] == cat:
                bill_type = cats[cat]

        b = Bill(session, bill_chamber, bill_id, bill_title,
                 type=bill_type)

        b.add_source(sheet_url)

        versions_url = \
            bill[index["version"]].xpath('font/a')[0].attrib["href"]
        versions_url = CO_URL_BASE + versions_url
        versions = self.parse_versions(versions_url)

        for version in versions:
            b.add_version(version['name'], version['link'],
                          mimetype=version['mimetype'])

        bill_history_href = CO_URL_BASE + \
            bill[index["history"]][0][0].attrib['href']
        # ^^^^^^^ We assume this is a full path to the target.
        # might want to consider some better rel-path support
        # XXX: Look at this ^

        history = self.parse_history(bill_history_href)
        b.add_source(bill_history_href)

        chamber_map = dict(Senate='upper', House='lower')
        for action, date in history:
            action_actor = chamber_map.get(chamber, chamber)
            attrs = dict(actor=action_actor, action=action, date=date)
            attrs.update(self.categorizer.categorize(action))
            b.add_action(**attrs)

        for sponsor in sponsors:
            if sponsor != None and sponsor != "(NONE)" and \
                    sponsor != "":
                b.add_sponsor("primary", sponsor)

        # Now that we have history, let's see if we can't grab some
        # votes

        bill_vote_href = self.get_vote_url(bill_id, session)
        votes = self.parse_votes(bill_vote_href)

        # The vote page echoes back the bill id; make sure we got the
        # page we asked for before trusting its data.
        if votes['sanity-check'] != bill_id:
            self.warning("XXX: READ ME! Sanity check failed!")
            self.warning(" -> Scraped ID: " + votes['sanity-check'])
            self.warning(" -> 'Real' ID: " + bill_id)
            assert votes['sanity-check'] == bill_id

        for vote in votes['votes']:
            filed_votes = vote['votes']
            passage = vote['meta']
            result = vote['result']

            composite_time = "%s %s" % (passage['x-parent-date'],
                                        passage['TIME'])
            # It's now like: 04/01/2011 02:10:14 PM
            pydate = dt.datetime.strptime(composite_time,
                                          "%m/%d/%Y %I:%M:%S %p")
            hasHouse = "House" in passage['x-parent-ctty']
            hasSenate = "Senate" in passage['x-parent-ctty']

            if hasHouse and hasSenate:
                actor = "joint"
            elif hasHouse:
                actor = "lower"
            else:
                actor = "upper"

            other = (int(result['EXC']) + int(result['ABS']))
            # OK, sometimes the Other count is wrong.
            # Recount "other" from the individual ballots and prefer that
            # tally when it disagrees with the published EXC+ABS total.
            local_other = 0
            for voter in filed_votes:
                l_vote = filed_votes[voter].lower().strip()
                if l_vote != "yes" and l_vote != "no":
                    local_other = local_other + 1

            if local_other != other:
                self.warning( \
                    "XXX: !!!WARNING!!! - resetting the 'OTHER' VOTES")
                self.warning(" -> Old: %s // New: %s" % (
                    other, local_other))
                other = local_other

            passed = (result['FINAL_ACTION'] == "PASS")
            if passage['MOTION'].strip() == "":
                continue

            if "without objection" in passage['MOTION'].lower():
                passed = True

            v = Vote(actor, pydate, passage['MOTION'], passed,
                     int(result['YES']), int(result['NO']), other,
                     moved=passage['MOVED'],
                     seconded=passage['SECONDED'])

            v.add_source(vote['meta']['url'])
            # v.add_source( bill_vote_href )

            # XXX: Add more stuff to kwargs, we have a ton of data
            seen = set([])
            for voter in filed_votes:
                who = voter
                if who in seen:
                    raise Exception(
                        "Seeing the double-thing. - bug #702")
                seen.add(who)
                vote = filed_votes[who]
                if vote.lower() == "yes":
                    v.yes(who)
                elif vote.lower() == "no":
                    v.no(who)
                else:
                    v.other(who)
            b.add_vote(v)
        self.save_bill(b)
def scrape(self, session, chambers):
    """Scrape all Vermont bills and resolutions for *session* via the
    legislature's JSON endpoints, then per bill: sponsors, text versions,
    actions, and roll-call votes.
    """
    HTML_TAGS_RE = r'<.*?>'

    # e.g. session "2015-2016" -> "2016" year slug used in every URL.
    year_slug = session[5:]

    # Load all bills and resolutions via the private API
    bills_url = \
        'http://legislature.vermont.gov/bill/loadBillsReleased/{}/'.\
        format(year_slug)
    bills_json = self.get(bills_url).text
    bills = json.loads(bills_json)['data'] or []

    bills_url = \
        'http://legislature.vermont.gov/bill/loadBillsIntroduced/{}/'.\
        format(year_slug)
    bills_json = self.get(bills_url).text
    bills.extend(json.loads(bills_json)['data'] or [])

    resolutions_url = \
        'http://legislature.vermont.gov/bill/loadAllResolutionsByChamber/{}/both'.\
        format(year_slug)
    resolutions_json = self.get(resolutions_url).text
    bills.extend(json.loads(resolutions_json)['data'] or [])

    # Parse the information from each bill
    for info in bills:
        # Strip whitespace from strings
        info = {k: v.strip() for k, v in info.iteritems()}

        # Identify the bill type and chamber
        # (prefix checks must run longest-prefix-first, e.g. "J.R.H."
        # before "H.")
        if info['BillNumber'].startswith('J.R.H.'):
            bill_type = 'joint resolution'
            bill_chamber = 'lower'
        elif info['BillNumber'].startswith('J.R.S.'):
            bill_type = 'joint resolution'
            bill_chamber = 'upper'
        elif info['BillNumber'].startswith('H.C.R.'):
            bill_type = 'concurrent resolution'
            bill_chamber = 'lower'
        elif info['BillNumber'].startswith('S.C.R.'):
            bill_type = 'concurrent resolution'
            bill_chamber = 'upper'
        elif info['BillNumber'].startswith('H.R.'):
            bill_type = 'resolution'
            bill_chamber = 'lower'
        elif info['BillNumber'].startswith('S.R.'):
            bill_type = 'resolution'
            bill_chamber = 'upper'
        elif info['BillNumber'].startswith('PR.'):
            bill_type = 'constitutional amendment'
            if info['Body'] == 'H':
                bill_chamber = 'lower'
            elif info['Body'] == 'S':
                bill_chamber = 'upper'
            else:
                raise AssertionError("Amendment not tied to chamber")
        elif info['BillNumber'].startswith('H.'):
            bill_type = 'bill'
            bill_chamber = 'lower'
        elif info['BillNumber'].startswith('S.'):
            bill_type = 'bill'
            bill_chamber = 'upper'
        else:
            raise AssertionError("Unknown bill type found: '{}'".format(
                info['BillNumber']))

        # Create the bill using its basic information
        bill = Bill(session=session,
                    bill_id=info['BillNumber'],
                    title=info['Title'],
                    chamber=bill_chamber,
                    type=bill_type)
        if 'resolution' in bill_type:
            bill.add_source(resolutions_url)
        else:
            bill.add_source(bills_url)

        # Load the bill's information page to access its metadata
        bill_url = \
            'http://legislature.vermont.gov/bill/status/{0}/{1}'.\
            format(year_slug, info['BillNumber'])
        doc = self.lxmlize(bill_url)
        bill.add_source(bill_url)

        # Capture sponsors
        sponsors = doc.xpath(
            '//dl[@class="summary-table"]/dt[text()="Sponsor(s)"]/'
            'following-sibling::dd[1]/ul/li')
        sponsor_type = 'primary'
        for sponsor in sponsors:
            # Everything after the "Additional Sponsors" marker row is a
            # cosponsor.
            if sponsor.xpath('span/text()') == ['Additional Sponsors']:
                sponsor_type = 'cosponsor'
                continue

            sponsor_name = sponsor.xpath('a/text()')[0].\
                replace("Rep.", "").replace("Sen.", "").strip()
            # "Less" is a collapse-list UI link, not a legislator.
            if sponsor_name and not \
                    (sponsor_name[:5] == "Less" and
                     len(sponsor_name) == 5):
                bill.add_sponsor(sponsor_type, sponsor_name)

        # Capture bill text versions
        # Warning: There's a TODO in VT's source code saying 'move this to where it used to be'
        # so leave in the old and new positions
        versions = doc.xpath(
            '//dl[@class="summary-table"]/dt[text()="Bill/Resolution Text"]/'
            'following-sibling::dd[1]/ul/li/a |'
            '//ul[@class="bill-path"]//a')
        for version in versions:
            if version.xpath('text()'):
                bill.add_version(name=version.xpath('text()')[0],
                                 url=version.xpath('@href')[0].replace(
                                     ' ', '%20'),
                                 mimetype='application/pdf')

        # Identify the internal bill ID, used for actions and votes
        # If there is no internal bill ID, then it has no extra information
        try:
            internal_bill_id = re.search(
                r'"bill/loadBillDetailedStatus/{}/(\d+)"'.format(
                    year_slug),
                lxml.etree.tostring(doc)).group(1)
        except AttributeError:
            self.warning("Bill {} appears to have no activity".\
                format(info['BillNumber']))
            self.save_bill(bill)
            continue

        # Capture actions
        actions_url = 'http://legislature.vermont.gov/bill/loadBillDetailedStatus/{0}/{1}'.\
            format(year_slug, internal_bill_id)
        actions_json = self.get(actions_url).text
        actions = json.loads(actions_json)['data']
        bill.add_source(actions_url)

        chambers_passed = set()
        for action in actions:
            action = {k: v.strip() for k, v in action.iteritems()}

            if "Signed by Governor" in action['FullStatus']:
                actor = 'governor'
            elif action['ChamberCode'] == 'H':
                actor = 'lower'
            elif action['ChamberCode'] == 'S':
                actor = 'upper'
            else:
                raise AssertionError("Unknown actor for bill action")

            # Categorize action
            if "Signed by Governor" in action['FullStatus']:
                # A bill can only be signed after passing both chambers.
                assert chambers_passed == set("HS")
                action_type = 'governor:signed'
            elif actor == 'lower' and \
                    any(x.lower().startswith('aspassed')
                        for x in action['keywords'].split(';')):
                action_type = 'bill:passed'
                chambers_passed.add("H")
            elif actor == 'upper' and \
                    any(x.lower().startswith(' aspassed') or
                        x.lower().startswith('aspassed')
                        for x in action['keywords'].split(';')):
                action_type = 'bill:passed'
                chambers_passed.add("S")
            else:
                action_type = 'other'

            bill.add_action(actor=actor,
                            action=re.sub(HTML_TAGS_RE, "",
                                          action['FullStatus']),
                            date=datetime.datetime.strptime(
                                action['StatusDate'], '%m/%d/%Y'),
                            type=action_type)

        # Capture votes
        votes_url = 'http://legislature.vermont.gov/bill/loadBillRollCalls/{0}/{1}'.\
            format(year_slug, internal_bill_id)
        votes_json = self.get(votes_url).text
        votes = json.loads(votes_json)['data']
        bill.add_source(votes_url)

        for vote in votes:
            roll_call_id = vote['VoteHeaderID']
            roll_call_url = 'http://legislature.vermont.gov/bill/loadBillRollCallDetails/{0}/{1}'.\
                format(year_slug, roll_call_id)
            roll_call_json = self.get(roll_call_url).text
            roll_call = json.loads(roll_call_json)['data']

            roll_call_yea = []
            roll_call_nay = []
            roll_call_other = []
            for member in roll_call:
                # Names come as "Smith of Burlington".
                (member_name, _district) = \
                    member['MemberName'].split(" of ")
                member_name = member_name.strip()

                if member['MemberVote'] == "Yea":
                    roll_call_yea.append(member_name)
                elif member['MemberVote'] == "Nay":
                    roll_call_nay.append(member_name)
                else:
                    roll_call_other.append(member_name)

            if "Passed -- " in vote['FullStatus']:
                did_pass = True
            elif "Failed -- " in vote['FullStatus']:
                did_pass = False
            else:
                raise AssertionError("Roll call vote result is unclear")

            # Check vote counts
            yea_count = \
                int(re.search(r'Yeas = (\d+)',
                              vote['FullStatus']).group(1))
            nay_count = \
                int(re.search(r'Nays = (\d+)',
                              vote['FullStatus']).group(1))

            vote_to_add = Vote(
                chamber=('lower' if vote['ChamberCode'] == 'H'
                         else 'upper'),
                date=datetime.datetime.strptime(
                    vote['StatusDate'], '%m/%d/%Y'),
                motion=re.sub(HTML_TAGS_RE, "",
                              vote['FullStatus']).strip(),
                passed=did_pass,
                yes_count=yea_count,
                no_count=nay_count,
                other_count=len(roll_call_other))
            vote_to_add.add_source(roll_call_url)

            for member in roll_call_yea:
                vote_to_add.yes(member)
            for member in roll_call_nay:
                vote_to_add.no(member)
            for member in roll_call_other:
                vote_to_add.other(member)

            try:
                vote_to_add.validate()
            except ValueError as e:
                # Published counts occasionally disagree with the roll
                # call; log and keep the vote anyway.
                self.warning(e)

            bill.add_vote(vote_to_add)

        # Capture extra information
        # This is not in the OpenStates spec, but is available
        # Not yet implemented
        # Witnesses: http://legislature.vermont.gov/bill/loadBillWitnessList/{year_slug}/{internal_bill_id}
        # Conference committee members: http://legislature.vermont.gov/bill/loadBillConference/{year_slug}/{bill_number}
        # Committee meetings: http://legislature.vermont.gov/committee/loadHistoryByBill/{year_slug}?LegislationId={internal_bill_id}

        self.save_bill(bill)
def scrape_bill_pages(self, session, year_abr):
    """ assemble information on a bill from a number of DBF files
    """

    #Main Bill information
    main_bill_url, main_bill_db = self.get_dbf(year_abr, 'MAINBILL')

    # keep a dictionary of bills (mapping bill_id to Bill obj)
    bill_dict = {}

    for rec in main_bill_db:
        bill_type = rec["billtype"]
        bill_number = int(rec["billnumber"])
        bill_id = bill_type + str(bill_number)
        title = rec["synopsis"]
        # 'A'-prefixed ids are Assembly bills, everything else Senate.
        if bill_type[0] == 'A':
            chamber = "lower"
        else:
            chamber = "upper"

        # some bills have a blank title.. just skip it
        if not title:
            continue

        bill = Bill(str(session), chamber, bill_id, title,
                    type=self._bill_types[bill_type[1:]])
        bill.add_source(main_bill_url)
        bill_dict[bill_id] = bill

    #Sponsors
    bill_sponsors_url, bill_sponsors_db = self.get_dbf(
        year_abr, 'BILLSPON')

    for rec in bill_sponsors_db:
        bill_type = rec["billtype"]
        bill_number = int(rec["billnumber"])
        bill_id = bill_type + str(bill_number)
        bill = bill_dict[bill_id]
        name = rec["sponsor"]
        sponsor_type = rec["type"]
        if sponsor_type == 'P':
            sponsor_type = "Primary"
        else:
            sponsor_type = "Co-sponsor"
        bill.add_sponsor(sponsor_type, name)

    #Documents
    bill_document_url, bill_document_db = self.get_dbf(year_abr, 'BILLWP')
    #print bill_document_db[2]
    for rec in bill_document_db:
        bill_type = rec["billtype"]
        bill_number = int(rec["billnumber"])
        bill_id = bill_type + str(bill_number)
        if bill_id not in bill_dict:
            self.warning('unknown bill %s in document database' % bill_id)
            continue
        bill = bill_dict[bill_id]
        document = rec["document"]
        # Document path is a Windows-style path; keep the last two parts.
        document = document.split('\\')
        document = document[-2] + "/" + document[-1]
        year = str(year_abr) + str((year_abr + 1))

        #doc_url = "ftp://www.njleg.state.nj.us/%s/%s" % (year, document)
        htm_url = 'http://www.njleg.state.nj.us/%s/Bills/%s' % (
            year_abr, document.replace('.DOC', '.HTM'))

        # name document based _doctype
        try:
            doc_name = self._doctypes[rec['doctype']]
        except KeyError:
            raise Exception('unknown doctype %s on %s' %
                            (rec['doctype'], bill_id))
        if rec['comment']:
            doc_name += ' ' + rec['comment']

        if rec['doctype'] in self._version_types:
            bill.add_version(doc_name, htm_url)
        else:
            bill.add_document(doc_name, htm_url)

    # Votes
    # One zip per chamber per year, plus committee ('C'-prefixed) files.
    next_year = int(year_abr) + 1
    vote_info_list = [
        'A%s' % year_abr,
        'A%s' % next_year,
        'S%s' % year_abr,
        'S%s' % next_year,
        'CA%s-%s' % (year_abr, next_year),
        'CS%s-%s' % (year_abr, next_year),
    ]

    for filename in vote_info_list:
        s_vote_url = 'ftp://www.njleg.state.nj.us/votes/%s.zip' % filename
        try:
            s_vote_zip, resp = self.urlretrieve(s_vote_url)
        except scrapelib.FTPError:
            self.warning('could not find %s' % s_vote_url)
            continue
        zipedfile = zipfile.ZipFile(s_vote_zip)
        vfile = "%s.txt" % filename
        vote_file = zipedfile.open(vfile, 'U')
        vdict_file = csv.DictReader(vote_file)

        votes = {}
        if filename.startswith('A') or filename.startswith('CA'):
            chamber = "lower"
        else:
            chamber = "upper"

        # Committee files use a different column layout than floor files.
        if filename.startswith('C'):
            vote_file_type = 'committee'
        else:
            vote_file_type = 'chamber'

        for rec in vdict_file:
            if vote_file_type == 'chamber':
                bill_id = rec["Bill"].strip()
                leg = rec["Full_Name"]

                date = rec["Session_Date"]
                action = rec["Action"]
                leg_vote = rec["Legislator_Vote"]
            else:
                bill_id = '%s%s' % (rec['Bill_Type'], rec['Bill_Number'])
                leg = rec['Name']
                # drop time portion
                date = rec['Agenda_Date'].split()[0]
                # make motion readable
                action = self._com_vote_motions[rec['BillAction']]
                # first char (Y/N) use [0:1] to ignore ''
                leg_vote = rec['LegislatorVote'][0:1]

            date = datetime.strptime(date, "%m/%d/%Y")
            vote_id = '_'.join((bill_id, chamber, action))
            vote_id = vote_id.replace(" ", "_")

            # One Vote object per (bill, chamber, motion); individual
            # legislator records accumulate onto it.
            if vote_id not in votes:
                votes[vote_id] = Vote(chamber, date, action, None, None,
                                      None, None, bill_id=bill_id)
            if vote_file_type == 'committee':
                votes[vote_id]['committee'] = self._committees[
                    rec['Committee_House']]

            if leg_vote == "Y":
                votes[vote_id].yes(leg)
            elif leg_vote == "N":
                votes[vote_id].no(leg)
            else:
                votes[vote_id].other(leg)

        # remove temp file
        os.remove(s_vote_zip)

        #Counts yes/no/other votes and saves overall vote
        for vote in votes.itervalues():
            vote_yes_count = len(vote["yes_votes"])
            vote_no_count = len(vote["no_votes"])
            vote_other_count = len(vote["other_votes"])
            vote["yes_count"] = vote_yes_count
            vote["no_count"] = vote_no_count
            vote["other_count"] = vote_other_count
            if vote_yes_count > vote_no_count:
                vote["passed"] = True
            else:
                vote["passed"] = False
            vote_bill_id = vote["bill_id"]
            bill = bill_dict[vote_bill_id]
            bill.add_vote(vote)

    #Actions
    bill_action_url, bill_action_db = self.get_dbf(year_abr, 'BILLHIST')
    actor_map = {'A': 'lower', 'G': 'executive', 'S': 'upper'}

    for rec in bill_action_db:
        bill_type = rec["billtype"]
        bill_number = int(rec["billnumber"])
        bill_id = bill_type + str(bill_number)
        bill = bill_dict[bill_id]
        action = rec["action"]
        date = rec["dateaction"]
        actor = actor_map[rec["house"]]
        comment = rec["comment"]
        action, atype = self.categorize_action(action, bill_id)
        if comment:
            action += (' ' + comment)
        bill.add_action(actor, action, date, type=atype)

    # Subjects
    subject_url, subject_db = self.get_dbf(year_abr, 'BILLSUBJ')
    for rec in subject_db:
        bill_id = rec['billtype'] + str(int(rec['billnumber']))
        bill = bill_dict.get(bill_id)
        if bill:
            bill.setdefault('subjects', []).append(rec['subjectkey'])
        else:
            self.warning('invalid bill id in BILLSUBJ.DBF: %s' % bill_id)

    # save all bills at the end
    for bill in bill_dict.itervalues():
        # add sources
        bill.add_source(bill_sponsors_url)
        bill.add_source(bill_document_url)
        bill.add_source(bill_action_url)
        bill.add_source(subject_url)
        self.save_bill(bill)
def scrape_votes(self, bill, url):
    """Scrape Oklahoma roll-call votes from *url* and attach them to *bill*.

    The page is a sequence of <p> elements; votes are located relative to
    a header paragraph matching 'OKLAHOMA HOUSE' / 'OKLAHOMA STATE SENATE',
    so the parsing below is positional (sibling offsets) by necessity.
    """
    # Normalize non-breaking spaces before parsing.
    page = lxml.html.fromstring(self.urlopen(url).replace(u'\xa0', ' '))

    re_ns = "http://exslt.org/regular-expressions"
    path = "//p[re:test(text(), 'OKLAHOMA\s+(HOUSE|STATE\s+SENATE)')]"
    for header in page.xpath(path, namespaces={'re': re_ns}):

        # The motion paragraph sits at a different sibling offset
        # depending on the chamber's page layout.
        if 'HOUSE' in header.xpath("string()"):
            chamber = 'lower'
            motion_index = 8
        else:
            chamber = 'upper'
            motion_index = 9

        motion = header.xpath(
            "string(following-sibling::p[%d])" % motion_index).strip()
        motion = re.sub(r'\s+', ' ', motion)
        # Motion text may carry its own outcome suffix; if so, strip it
        # and record the result, otherwise infer passage from counts below.
        match = re.match(r'^(.*) (PASSED|FAILED)$', motion)
        if match:
            motion = match.group(1)
            passed = match.group(2) == 'PASSED'
        else:
            passed = None

        # RCS# line identifies the roll call; the date is on the next <p>.
        rcs_p = header.xpath(
            "following-sibling::p[contains(., 'RCS#')]")[0]
        rcs_line = rcs_p.xpath("string()").replace(u'\xa0', ' ')
        rcs = re.search(r'RCS#\s+(\d+)', rcs_line).group(1)

        date_line = rcs_p.getnext().xpath("string()")
        date = re.search(r'\d+/\d+/\d+', date_line).group(0)
        date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

        vtype = None
        counts = collections.defaultdict(int)
        votes = collections.defaultdict(list)

        # Voter names start a fixed number of paragraphs after the header;
        # a '*****' divider marks the end of this vote's section.
        for sib in header.xpath("following-sibling::p")[13:]:
            line = sib.xpath("string()").replace('\r\n', ' ').strip()
            if "*****" in line:
                break

            match = re.match(
                r'(YEAS|NAYS|EXCUSED|VACANT|CONSTITUTIONAL PRIVILEGE|NOT VOTING)\s*:\s*(\d+)',
                line)
            if match:
                if match.group(1) == 'YEAS':
                    vtype = 'yes'
                elif match.group(1) == 'NAYS':
                    vtype = 'no'
                elif match.group(1) == 'VACANT':
                    continue  # skip these
                else:
                    vtype = 'other'
                counts[vtype] += int(match.group(2))
            else:
                # Lines between headers list voter names for the current
                # vtype bucket.
                for name in line.split(' '):
                    if not name:
                        continue
                    if 'HOUSE BILL' in name or 'SENATE BILL' in name:
                        continue
                    votes[vtype].append(name.strip())

        # Sanity check: parsed name lists must match the published counts.
        assert len(votes['yes']) == counts['yes']
        assert len(votes['no']) == counts['no']
        assert len(votes['other']) == counts['other']

        if passed is None:
            passed = counts['yes'] > (counts['no'] + counts['other'])

        vote = Vote(chamber, date, motion, passed,
                    counts['yes'], counts['no'], counts['other'],
                    rcs_num=rcs)
        vote.add_source(url)

        for name in votes['yes']:
            vote.yes(name)
        for name in votes['no']:
            vote.no(name)
        for name in votes['other']:
            vote.other(name)

        bill.add_vote(vote)
def scrape_bill_type(self, chamber, session, bill_type, type_abbr):
    """Scrape all California bills of one measure type from the CA
    legislative SQL mirror (``self.session`` is a SQLAlchemy session),
    building and saving a Bill (with versions, sponsors, actions and
    votes) for each row.
    """
    if chamber == 'upper':
        chamber_name = 'SENATE'
    else:
        chamber_name = 'ASSEMBLY'

    bills = self.session.query(CABill).filter_by(
        session_year=session).filter_by(measure_type=type_abbr)

    for bill in bills:
        bill_session = session
        if bill.session_num != '0':
            bill_session += ' Special Session %s' % bill.session_num

        bill_id = bill.short_bill_id

        # Title is filled in later from the newest version's data.
        fsbill = Bill(bill_session, chamber, bill_id, '')

        # Construct session for web query, going from '20092010' to '0910'
        source_session = session[2:4] + session[6:8]

        # Turn 'AB 10' into 'ab_10'
        source_num = "%s_%s" % (bill.measure_type.lower(),
                                bill.measure_num)

        # Construct a fake source url
        source_url = ("http://www.leginfo.ca.gov/cgi-bin/postquery?"
                      "bill_number=%s&sess=%s" % (source_num, source_session))

        fsbill.add_source(source_url)

        scraped_versions = self.scrape_site_versions(source_url)

        title = ''
        short_title = ''
        type = ['bill']
        subject = ''
        all_titles = set()
        # Index into scraped_versions; advanced only when a scraped
        # version's date matches the DB version's date.
        i = 0
        for version in bill.versions:
            if not version.bill_xml:
                continue

            # After the loop, title/short_title/subject/type hold the
            # values from the most recent version processed.
            title = clean_title(version.title)
            all_titles.add(title)
            short_title = clean_title(version.short_title)
            type = [bill_type]

            if version.appropriation == 'Yes':
                type.append('appropriation')
            if version.fiscal_committee == 'Yes':
                type.append('fiscal committee')
            if version.local_program == 'Yes':
                type.append('local program')
            if version.urgency == 'Yes':
                type.append('urgency')
            if version.taxlevy == 'Yes':
                type.append('tax levy')

            if version.subject:
                subject = clean_title(version.subject)

            date = version.bill_version_action_date.date()

            # Try to pair this DB version with a scraped web version by
            # date; leave url blank if there is no match.
            url = ''
            try:
                scraped_version = scraped_versions[i]
                if scraped_version[0] == date:
                    url = scraped_version[1]
                    i += 1
            except IndexError:
                pass

            fsbill.add_version(
                version.bill_version_id, url,
                date=date,
                title=title,
                short_title=short_title,
                subject=[subject],
                type=type)

        if not title:
            self.warning("Couldn't find title for %s, skipping" % bill_id)
            continue

        fsbill['title'] = title
        fsbill['short_title'] = short_title
        fsbill['type'] = type
        fsbill['subjects'] = [subject]

        # We don't want the current title in alternate_titles
        all_titles.remove(title)

        fsbill['alternate_titles'] = list(all_titles)

        # NOTE: `version` is the last version from the loop above;
        # sponsors are taken from that newest version only.
        for author in version.authors:
            if author.house == chamber_name:
                fsbill.add_sponsor(author.contribution, author.name)

        introduced = False

        for action in bill.actions:
            if not action.action:
                # NULL action text seems to be an error on CA's part,
                # unless it has some meaning I'm missing
                continue
            actor = action.actor or chamber
            actor = actor.strip()
            # Map chamber-ish actor strings onto openstates actor codes.
            match = re.match(r'(Assembly|Senate)($| \(Floor)', actor)
            if match:
                actor = {'Assembly': 'lower',
                         'Senate': 'upper'}[match.group(1)]
            elif actor.startswith('Governor'):
                actor = 'executive'
            else:
                actor = re.sub('^Assembly', 'lower', actor)
                actor = re.sub('^Senate', 'upper', actor)

            type = []

            act_str = action.action
            act_str = re.sub(r'\s+', ' ', act_str)

            # Classify the action text into zero or more action types.
            if act_str.startswith('Introduced'):
                introduced = True
                type.append('bill:introduced')

            if 'Read first time.' in act_str:
                if not introduced:
                    type.append('bill:introduced')
                    introduced = True
                type.append('bill:reading:1')

            if 'To Com' in act_str or 'referred to' in act_str.lower():
                type.append('committee:referred')

            if 'Read third time. Passed.' in act_str:
                type.append('bill:passed')

            if 'Approved by Governor' in act_str:
                type.append('governor:signed')

            if 'Item veto' in act_str:
                type.append('governor:vetoed:line-item')

            if 'Vetoed by Governor' in act_str:
                type.append('governor:vetoed')

            if 'To Governor' in act_str:
                type.append('governor:received')

            if 'Read second time' in act_str:
                type.append('bill:reading:2')

            if not type:
                type = ['other']

            fsbill.add_action(actor, act_str, action.action_date.date(),
                              type=type)

        for vote in bill.votes:
            if vote.vote_result == '(PASS)':
                result = True
            else:
                result = False

            # Location strings look like 'ASM. APPR.' / 'SEN. FLOOR';
            # first token identifies the chamber, rest is the location.
            full_loc = vote.location.description
            first_part = full_loc.split(' ')[0].lower()
            if first_part in ['asm', 'assembly']:
                vote_chamber = 'lower'
                vote_location = ' '.join(full_loc.split(' ')[1:])
            elif first_part.startswith('sen'):
                vote_chamber = 'upper'
                vote_location = ' '.join(full_loc.split(' ')[1:])
            else:
                raise ScrapeError("Bad location: %s" % full_loc)

            motion = vote.motion.motion_text or ''

            if "Third Reading" in motion or "3rd Reading" in motion:
                vtype = 'passage'
            elif "Do Pass" in motion:
                vtype = 'passage'
            else:
                vtype = 'other'

            motion = motion.strip()

            # Why did it take until 2.7 to get a flags argument on re.sub?
            motion = re.compile(r'(\w+)( Extraordinary)? Session$',
                                re.IGNORECASE).sub('', motion)
            motion = re.compile(r'^(Senate|Assembly) ',
                                re.IGNORECASE).sub('', motion)
            motion = re.sub(r'^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? ',
                            '', motion)
            motion = re.sub(r' \(\w+\)$', '', motion)
            motion = re.sub(r'(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$',
                            '', motion)
            motion = re.sub(r'(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? '
                            r'Urgency Clause$',
                            '(Urgency Clause)', motion)
            motion = re.sub(r'\s+', ' ', motion)

            if not motion:
                self.warning("Got blank motion on vote for %s" % bill_id)
                continue

            fsvote = Vote(vote_chamber,
                          self._tz.localize(vote.vote_date_time),
                          motion, result,
                          int(vote.ayes), int(vote.noes), int(vote.abstain),
                          threshold=vote.threshold, type=vtype)

            if vote_location != 'Floor':
                fsvote['committee'] = vote_location

            for record in vote.votes:
                if record.vote_code == 'AYE':
                    fsvote.yes(record.legislator_name)
                elif record.vote_code.startswith('NO'):
                    fsvote.no(record.legislator_name)
                else:
                    fsvote.other(record.legislator_name)

            # The abstain count field in CA's database includes
            # vacancies, which we aren't interested in.
            fsvote['other_count'] = len(fsvote['other_votes'])

            fsbill.add_vote(fsvote)

        self.save_bill(fsbill)
def _parse_votes(self, url, vote):
    '''Given a vote url and a vote object, extract the voters and
    the vote counts from the vote page and update the vote object.

    PDF urls are delegated to PDFCommitteeVote; HTML pages are parsed
    directly. Returns the finished Vote object, or None when the page
    is missing, unparseable, or lacks vote data.
    '''
    if url.lower().endswith('.pdf'):
        try:
            resp = self.urlopen(url)
        except HTTPError:
            # This vote document wasn't found.
            msg = 'No document found at url %r' % url
            self.logger.warning(msg)
            return

        try:
            v = PDFCommitteeVote(url, resp.bytes)
            return v.asvote()
        except PDFCommitteeVoteParseError:
            # Warn and skip.
            self.warning("Couldn't parse committee vote at %r" % url)
            return

    keymap = {'Y': 'yes', 'N': 'no'}
    html = self.urlopen(url)
    doc = lxml.html.fromstring(html)

    # Yes, no, excused, absent.
    try:
        vals = doc.xpath('//table')[1].xpath('tr/td/text()')
    except IndexError:
        # Most likely was a bogus link lacking vote data.
        return

    y, n, e, a = map(int, vals)
    vote.update(yes_count=y, no_count=n, other_count=e + a)

    # Get the motion.
    # Narrowed from a bare except: an empty //br list raises IndexError
    # and a missing tail raises AttributeError on .strip().
    try:
        motion = doc.xpath('//br')[-1].tail.strip()
    except (IndexError, AttributeError):
        # Some of them mysteriously have no motion listed.
        motion = vote['action']

    vote['motion'] = motion

    # Add placeholder for passed (see below)
    vote['passed'] = False
    vote = Vote(**vote)

    # Voter table: each cell is '<Y/N>\xa0<name>'; unknown codes
    # (excused etc.) fall into the 'other' bucket.
    for text in doc.xpath('//table')[2].xpath('tr/td/text()'):
        if not text.strip(u'\xa0'):
            continue
        v, name = filter(None, text.split(u'\xa0'))
        getattr(vote, keymap.get(v, 'other'))(name)

    action = vote['action']

    # Existing code to deterimine value of `passed`.
    # Compare the numeric counts; the earlier version compared the
    # yes_votes/no_votes *name lists*, which is a lexicographic list
    # comparison rather than the intended count comparison.
    yes_votes = vote['yes_count']
    no_votes = vote['no_count']
    passed = None

    # some actions take a super majority, so we aren't just
    # comparing the yeas and nays here.
    for i in vote_passage_indicators:
        if action.count(i):
            passed = True
    for i in vote_failure_indicators:
        if action.count(i) and passed is True:
            # a quick explanation: originally an exception was
            # thrown if both passage and failure indicators were
            # present because I thought that would be a bug in my
            # lists. Then I found 2007 HB 160.
            # Now passed = False if the nays outnumber the yays..
            # I won't automatically mark it as passed if the yays
            # ounumber the nays because I don't know what requires
            # a supermajority in MT.
            if no_votes >= yes_votes:
                passed = False
            else:
                raise Exception("passage and failure indicator "
                                "both present at: %s" % url)
        if action.count(i) and passed is None:
            passed = False
    for i in vote_ambiguous_indicators:
        if action.count(i):
            passed = yes_votes > no_votes
    if passed is None:
        raise Exception("Unknown passage at: %s" % url)

    vote['passed'] = passed
    return vote
def scrape(self, chamber, session):
    """Scrape roll-call votes for *chamber* from journal PDFs.

    Each PDF is converted to text and fed through a line-oriented state
    machine driven by the flags below:
      in_motion -- currently accumulating the motion text
      in_vote   -- currently collecting voter-name lines
      results   -- maps a vote label (e.g. 'YEAS') to the list of names
    """
    if chamber not in PAGES:
        return

    url = PAGES[chamber]
    page = self.lxmlize(url)
    pdfs = page.xpath("//a[contains(@href, '.pdf')]")

    for pdf in pdfs:
        # Reset all state for each journal PDF.
        bill_id = None
        results = {}
        in_vote = False
        cur_date = None
        in_motion = False
        cur_vote = None
        in_vote = False
        cur_motion = ""
        pdf_url = pdf.attrib['href']
        (path, response) = self.urlretrieve(pdf_url)
        data = convert_pdf(path, type='text')
        os.unlink(path)
        lines = data.splitlines()
        for line in lines:
            # First date seen in the PDF becomes the vote date.
            date = re.findall(date_re, line)
            if date != [] and not cur_date:
                date = date[0][0]
                cur_date = datetime.datetime.strptime(
                    date, "%A, %B %d, %Y")

            # Blank line terminates a motion paragraph.
            if line.strip() == "":
                in_motion = False
                continue

            # 'passed'/'lost' while collecting votes => end of this
            # roll call; build and save the Vote.
            if True in [x in line.lower()
                        for x in ['passed', 'lost']] and in_vote:
                in_vote = False
                bills = re.findall(r"(?i)(H|S|J)(B|R|M) (\d+)", line)
                if bills == [] or cur_motion.strip() == "":
                    # No bill id or no motion: discard this vote.
                    bill_id = None
                    results = {}
                    in_vote = False
                    in_motion = False
                    cur_vote = None
                    in_vote = False
                    continue
                print "CM: ", cur_motion
                cur_bill_id = "%s%s %s" % (bills[-1])
                keys = {
                    "YEAS": "yes",
                    "NAYS": "no",
                    "ABSENT AND NOT VOTING": "other"
                }
                # Remap the raw PDF labels onto yes/no/other buckets,
                # dropping empty name strings.
                res = {}
                for key in keys:
                    if key in results:
                        res[keys[key]] = filter(lambda a: a != "",
                                                results[key])
                    else:
                        res[keys[key]] = []
                # results
                results = {}
                yes, no, other = len(res['yes']), len(res['no']), \
                    len(res['other'])
                chambers = {"H": "lower", "S": "upper", "J": "joint"}
                try:
                    bc = chambers[cur_bill_id[0]]
                except KeyError:
                    bc = 'other'
                vote = Vote(chamber, cur_date, cur_motion, (yes > no),
                            yes, no, other,
                            session=session,
                            bill_id=cur_bill_id,
                            bill_chamber=bc)
                vote.add_source(pdf_url)
                vote.add_source(url)
                for key in res:
                    obj = getattr(vote, key)
                    for person in res[key]:
                        obj(person)
                self.save_vote(vote)
                # Reset state for the next roll call in this PDF.
                bill_id = None
                results = {}
                in_vote = False
                in_motion = False
                cur_vote = None
                in_vote = False
                cur_motion = ""
                # print bills
                # print "VOTE TAKEN"
            if 'VOTES FOR' in line:
                in_motion = False
                in_vote = False
                continue
            # NOTE(review): 'ABSET' is presumably matching a typo that
            # appears in the source PDFs ('ABSENT') — confirm before
            # "fixing" it.
            if 'ABSET' in line:
                if in_motion:
                    in_vote = True
                    in_motion = False
            # 'LABEL: name; name; ...' starts a new result bucket.
            if ":" in line and in_vote:
                cur_vote, who = line.split(":", 1)
                who = [x.strip() for x in who.split(';')]
                results[cur_vote] = who
                continue
            # Continuation lines add names to the current bucket.
            if in_vote:
                if cur_vote is None:
                    continue
                who = [x.strip() for x in line.split(";")]
                for person in who:
                    # print cur_vote
                    results[cur_vote].append(person)
                continue
            if "question being" in line:
                cur_motion = line.strip()
                in_motion = True
                continue
            if in_motion:
                cur_motion += line.strip()
                continue
            if line.strip() == 'ROLL CALL':
                in_vote = True
def scrape(self, chamber, session):
    """Scrape North Carolina votes for one chamber from the legislature's
    FTP zip of delimited text files (Members / Votes / MemberVotes).
    """
    # Unfortunately, you now have to request access to FTP.
    # This method of retrieving votes needs to be changed or
    # fall back to traditional web scraping.
    if session == '2009':
        # 2009 files have a different delimiter and naming scheme.
        vote_data_url = 'ftp://www.ncleg.net/Bill_Status/Vote Data 2009.zip'
        naming_scheme = '{session}{file_label}.txt'
        delimiter = ";"
    else:
        vote_data_url = 'ftp://www.ncleg.net/Bill_Status/Votes%s.zip' % session
        naming_scheme = '{file_label}_{session}.txt'
        delimiter = "\t"

    fname, resp = self.urlretrieve(vote_data_url)
    # fname = "/Users/brian/Downloads/Vote Data 2009.zip"

    zf = ZipFile(fname)

    chamber_code = 'H' if chamber == 'lower' else 'S'

    # Members_YYYY.txt: tab separated
    # 0: id (unique only in chamber)
    # 1: H or S
    # 2: member name
    # 3-5: county, district, party
    # 6: mmUserId
    member_file = zf.open(naming_scheme.format(file_label='Members',
                                               session=session))
    # member id -> member name, for this chamber only
    members = {}
    for line in member_file.readlines():
        data = line.split(delimiter)
        if data[1] == chamber_code:
            members[data[0]] = data[2]

    # Votes_YYYY.txt
    # 0: sequence number
    # 1: chamber (S/H)
    # 2: date
    # 3: prefix
    # 4: bill_id
    # 5: yes votes
    # 6: no votes
    # 7: excused absences
    # 8: excused votes
    # 9: didn't votes
    # 10: total yes+no
    # 11: sponsor
    # 12: reading info
    # 13: info
    # 20: PASSED/FAILED
    # 21: legislative day
    vote_file = zf.open(naming_scheme.format(file_label='Votes',
                                             session=session))
    bill_chambers = {'H': 'lower', 'S': 'upper'}
    # vote sequence number -> Vote
    votes = {}
    for line in vote_file.readlines():
        data = line.split(delimiter)
        if len(data) < 24:
            self.warning('line too short %s', data)
            continue
        if data[1] == chamber_code:
            date = datetime.datetime.strptime(data[2][:16],
                                              '%Y-%m-%d %H:%M')
            if data[3][0] not in bill_chambers:
                # skip votes that aren't on bills
                self.log('skipping vote %s' % data[0])
                continue

            votes[data[0]] = Vote(chamber, date, data[13],
                                  'PASS' in data[20],
                                  int(data[5]), int(data[6]),
                                  int(data[7]) + int(data[8]) +
                                  int(data[9]),
                                  bill_chamber=bill_chambers[data[3][0]],
                                  bill_id=data[3] + data[4],
                                  session=session)

    member_vote_file = zf.open(naming_scheme.format(
        file_label='MemberVotes', session=session))
    # 0: member id
    # 1: chamber (S/H)
    # 2: vote id
    # 3: vote chamber (always same as 1)
    # 4: vote (Y,N,E,X)
    # 5: pair ID (member)
    # 6: pair order
    # If a vote is paired then it should be counted as an 'other'
    for line in member_vote_file.readlines():
        data = line.split(delimiter)
        if data[1] == chamber_code:
            try:
                member_voting = members[data[0]]
            except KeyError:
                self.debug('Member %s not found.' % data[0])
                continue
            try:
                vote = votes[data[2]]
            except KeyError:
                self.debug('Vote %s not found.' % data[2])
                continue

            # -1 votes are Lt. Gov, not included in count, so we add them
            if data[4] == 'Y' and not data[5]:
                if data[0] == '-1':
                    vote['yes_count'] += 1
                vote.yes(member_voting)
            elif data[4] == 'N' and not data[5]:
                if data[0] == '-1':
                    vote['no_count'] += 1
                vote.no(member_voting)
            else:
                # for some reason other_count is high for paired votes
                if data[5]:
                    vote['other_count'] -= 1
                # is either E: excused, X: no vote, or paired (doesn't count)
                vote.other(member_voting)

    for vote in votes.itervalues():
        #vote.validate()
        vote.add_source(vote_data_url)
        self.save_vote(vote)

    # remove file
    zf.close()
    os.remove(fname)
def scrape_vote(self, bill, name, url): if "VOTE/H" in url: vote_chamber = 'lower' cols = (1, 5, 9, 13) name_offset = 3 yes_offset = 0 no_offset = 1 else: vote_chamber = 'upper' cols = (1, 6) name_offset = 4 yes_offset = 1 no_offset = 2 # Connecticut's SSL is causing problems with Scrapelib, so use Requests page = requests.get(url, verify=False).text if 'BUDGET ADDRESS' in page: return page = lxml.html.fromstring(page) yes_count = page.xpath( "string(//span[contains(., 'Those voting Yea')])") yes_count = int(re.match(r'[^\d]*(\d+)[^\d]*', yes_count).group(1)) no_count = page.xpath( "string(//span[contains(., 'Those voting Nay')])") no_count = int(re.match(r'[^\d]*(\d+)[^\d]*', no_count).group(1)) other_count = page.xpath("string(//span[contains(., 'Those absent')])") other_count = int(re.match(r'[^\d]*(\d+)[^\d]*', other_count).group(1)) need_count = page.xpath("string(//span[contains(., 'Necessary for')])") need_count = int(re.match(r'[^\d]*(\d+)[^\d]*', need_count).group(1)) date = page.xpath("string(//span[contains(., 'Taken on')])") date = re.match(r'.*Taken\s+on\s+(\d+/\s?\d+)', date).group(1) date = date.replace(' ', '') date = datetime.datetime.strptime(date + " " + bill['session'], "%m/%d %Y").date() vote = Vote(vote_chamber, date, name, yes_count > need_count, yes_count, no_count, other_count) vote.add_source(url) table = page.xpath("//table")[0] for row in table.xpath("tr"): for i in cols: name = row.xpath("string(td[%d])" % (i + name_offset)).strip() if not name or name == 'VACANT': continue if "Y" in row.xpath("string(td[%d])" % (i + yes_offset)): vote.yes(name) elif "N" in row.xpath("string(td[%d])" % (i + no_offset)): vote.no(name) else: vote.other(name) bill.add_vote(vote)
def _scrape_bill_details(self, url, bill):
    """Scrape a Virginia bill detail page: summary, full-text versions,
    and the history list (actions plus the two-line vote records that
    VA publishes, which are matched and merged via a one-entry cache).
    """
    html = self.get(url, retry_on_404=True).text
    doc = lxml.html.fromstring(html)

    # summary sections
    summary = doc.xpath(
        '//h4[starts-with(text(), "SUMMARY")]/following-sibling::p/text()')
    if summary and summary[0].strip():
        bill['summary'] = summary[0].strip()

    # versions
    for va in doc.xpath(
            '//h4[text()="FULL TEXT"]/following-sibling::ul[1]/li/a[1]'):
        # 11/16/09 \xa0House: Prefiled and ordered printed; offered 01/13/10 10100110D
        date, desc = va.text.split(u' \xa0')
        desc.rsplit(' ', 1)[0]  # chop off last part
        link = va.get('href')
        # Relative links need the site prefix.
        if 'http' not in link:
            link = '{}{}'.format(BASE_URL, link)
        date = datetime.datetime.strptime(date, '%m/%d/%y')

        # budget bills in VA are searchable but no full text available
        if '+men+' in link:
            self.warning(
                'not adding budget version, bill text not available')
        else:
            # VA duplicates reprinted bills, lets keep the original name
            bill.add_version(desc, link, date=date,
                             mimetype='text/html',
                             on_duplicate='use_old')

    # actions
    cached_vote = None
    cached_action = None
    for ali in doc.xpath('//h4[text()="HISTORY"]/following-sibling::ul[1]/'
                         'li'):
        vote = None

        date, action = ali.text_content().split(u' \xa0')
        actor, action = action.split(': ', 1)

        # Bill history entries purely in parentheses tend to be
        # notes and not actions, so we'll skip them.
        if action.startswith('(') and action.endswith(')'):
            continue

        actor = self.actor_map[actor]
        date = datetime.datetime.strptime(date.strip(), '%m/%d/%y')

        # if action ends in (##-Y ##-N) remove that part
        vrematch = self.vote_strip_re.match(action)
        # The following conditional logic is messy to handle
        # Virginia's crazy and inconsistently formatted bill
        # histories. Someone less harried and tired than me
        # could probably make this much cleaner. - alo
        if vrematch:
            vote_action, y, n, o = vrematch.groups()
            y = int(y)
            n = int(n)
            # Set default count for "other" votes to 0. We have to
            # do this explicitly as it's excluded from the action
            # text when there were no abstentions (the only type of
            # "other" vote encountered thus far).
            if o is None:
                o = 0
            else:
                o = int(o)

            vote_url = ali.xpath('a/@href')

            # Caches relevant information from the current action if
            # vote count encountered, then searches for the presence
            # of identical counts in the next entry (we assume that
            # it's probably there). If matching votes are found, it
            # pulls the cached data to create a unified vote record.
            #
            # This is because Virginia usually publishes two lines
            # of history data for a single vote, without guaranteed
            # order, so we cache and unsafely attempt to match on
            # identical vote counts in the next line.
            if cached_vote is None:
                cached_action = action
                cached_vote = Vote(actor, date, vote_action, y > n,
                                   y, n, o)
                if vote_url:
                    cached_vote.add_source(BASE_URL + vote_url[0])
                continue
            elif cached_vote is not None:
                if vote_action.startswith(u'VOTE:'):
                    # Current line is the 'VOTE:' half; pair it with
                    # the cached descriptive half on matching counts.
                    if (vote_url and
                            cached_vote['yes_count'] == y and
                            cached_vote['no_count'] == n and
                            cached_vote['other_count'] == o):
                        vote = cached_vote
                        self._parse_vote(vote, vote_url[0])
                        vote.add_source(BASE_URL + vote_url[0])
                        action = cached_action
                elif cached_vote['motion'].startswith('VOTE:'):
                    # Cached line was the 'VOTE:' half; this line
                    # supplies the real motion text.
                    if (cached_vote['yes_count'] == y and
                            cached_vote['no_count'] == n and
                            cached_vote['other_count'] == o):
                        vote = cached_vote
                        vote['motion'] = vote_action
                else:
                    # Cached vote doesn't match up to the current
                    # one. Save, then cache the current vote to
                    # begin the next search.
                    bill.add_vote(cached_vote)
                    cached_vote = Vote(actor, date, vote_action, y > n,
                                       y, n, o)
                    if vote_url:
                        cached_vote.add_source(BASE_URL + vote_url[0])
                    cached_action = action
                    continue

            if vote:
                bill.add_vote(vote)
            else:
                self.error('empty vote')
        else:
            # If this action isn't a vote, but the last one was,
            # there's obviously no additional vote data to match.
            # Go ahead and save the cached data.
            if cached_vote is not None:
                bill.add_vote(cached_vote)
                cached_vote = cached_action = None

        # categorize actions
        for pattern, atype in self._action_classifiers:
            if re.match(pattern, action):
                break
        else:
            atype = 'other'

        # if matched a 'None' atype, don't add the action
        if atype:
            bill.add_action(actor, action, date, type=atype)
def parse_bill_votes(self, doc, bill): params = { 'chamber': None, 'date': None, 'motion': None, 'passed': None, 'yes_count': None, 'no_count': None, 'other_count': None, } elems = doc.cssselect('a') # MD has a habit of listing votes twice seen_votes = set() for elem in elems: href = elem.get('href') if (href and "votes" in href and href.endswith('htm') and href not in seen_votes): seen_votes.add(href) vote_url = BASE_URL + href with self.urlopen(vote_url) as vote_html: vote_doc = lxml.html.fromstring(vote_html) # motion box = vote_doc.xpath( '//td[@colspan=3]/font[@size=-1]/text()') params['motion'] = box[-1] params['type'] = 'other' if 'senate' in href: params['chamber'] = 'upper' else: params['chamber'] = 'lower' for regex, vtype in vote_classifiers.iteritems(): if re.findall(regex, params['motion'], re.IGNORECASE): params['type'] = vtype # counts bs = vote_doc.xpath('//td[@width="20%"]/font/b/text()') yeas = int(bs[0].split()[0]) nays = int(bs[1].split()[0]) excused = int(bs[2].split()[0]) not_voting = int(bs[3].split()[0]) absent = int(bs[4].split()[0]) params['yes_count'] = yeas params['no_count'] = nays params['other_count'] = excused + not_voting + absent params['passed'] = yeas > nays # date # parse the following format: March 23, 2009 date_elem = vote_doc.xpath( '//font[starts-with(text(), "Legislative Date")]')[0] params['date'] = datetime.datetime.strptime( date_elem.text[18:], '%B %d, %Y') vote = Vote(**params) status = None for row in vote_doc.cssselect('table')[3].cssselect('tr'): text = row.text_content() if text.startswith('Voting Yea'): status = 'yes' elif text.startswith('Voting Nay'): status = 'no' elif text.startswith('Not Voting') or text.startswith( 'Excused'): status = 'other' else: for cell in row.cssselect('a'): getattr(vote, status)(cell.text.strip()) vote.add_source(vote_url) bill.add_vote(vote)