# NOTE: these scraper methods assume module-level imports of `re` and
# `datetime` (imported as `dt` in the state scrapers below), plus the
# Bill, Vote, and NoDataForYear classes from the shared framework.
def scrape_bills(self, chamber, year):
    # This example scraper only has data for the 2009-2010 session.
    if year != "2009":
        raise NoDataForYear(year)

    if chamber == "upper":
        other_chamber = "lower"
        bill_id = "SB 1"
    else:
        other_chamber = "upper"
        bill_id = "HB 1"

    b1 = Bill("2009-2010", chamber, bill_id, "A super bill")
    b1.add_source("http://example.com")
    b1.add_version("As Introduced", "http://example.com/SB1.html")
    b1.add_document("Google", "http://google.com")
    b1.add_sponsor("primary", "Bob Smith")
    b1.add_sponsor("secondary", "Johnson, Sally")

    d1 = datetime.datetime.strptime("1/29/2010", "%m/%d/%Y")
    v1 = Vote("upper", d1, "Final passage", True, 2, 0, 0)
    v1.yes("Bob Smith")
    v1.yes("Sally Johnson")

    d2 = datetime.datetime.strptime("1/30/2010", "%m/%d/%Y")
    v2 = Vote("lower", d2, "Final passage", False, 0, 1, 1)
    v2.no("B. Smith")
    v2.other("Sally Johnson")

    b1.add_vote(v1)
    b1.add_vote(v2)

    b1.add_action(chamber, "introduced", d1)
    b1.add_action(chamber, "read first time", d1)
    b1.add_action(other_chamber, "introduced", d2)

    self.save_bill(b1)
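# The example above leans on NoDataForYear from the shared framework;
# its definition isn't shown in this section. A minimal sketch, assuming
# the constructor only needs the offending year (the real class may
# carry more):
class ScrapeError(Exception):
    """Base class for scraper failures (assumed name)."""
    pass


class NoDataForYear(ScrapeError):
    """Raised when no data is available for the requested year."""

    def __init__(self, year):
        self.year = year

    def __str__(self):
        return 'No data exists for %s' % self.year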
def scrape_session(self, chamber, year): if chamber == "upper": bill_abbr = "SB|SCR|SJR" elif chamber == "lower": bill_abbr = "HB|HCR|HJR" # Sessions last 2 years, 1993-1994 was the 18th session = str(18 + ((int(year) - 1993) / 2)) year2 = str(int(year) + 1) # Full calendar year date1 = "0101" + year[2:] date2 = "1231" + year2[2:] # Get bill list bill_list_url = "http://www.legis.state.ak.us/" "basis/range_multi.asp?session=%s&date1=%s&date2=%s" % ( session, date1, date2, ) self.log("Getting bill list for %s %s (this may take a long time)." % (chamber, session)) bill_list = self.soup_parser(self.urlopen(bill_list_url)) # Find bill links re_str = "bill=%s\d+" % bill_abbr links = bill_list.findAll(href=re.compile(re_str)) for link in links: bill_id = link.contents[0].replace(" ", "") bill_name = link.parent.parent.findNext("td").find("font").contents[0].strip() bill = Bill(session, chamber, bill_id, bill_name.strip()) # Get the bill info page and strip malformed t info_url = "http://www.legis.state.ak.us/basis/%s" % link["href"] info_page = self.soup_parser(self.urlopen(info_url)) bill.add_source(info_url) # Get sponsors spons_str = info_page.find(text="SPONSOR(s):").parent.parent.contents[1] sponsors_match = re.match(" (SENATOR|REPRESENTATIVE)\([Ss]\) ([^,]+(,[^,]+){0,})", spons_str) if sponsors_match: sponsors = sponsors_match.group(2).split(",") bill.add_sponsor("primary", sponsors[0].strip()) for sponsor in sponsors[1:]: bill.add_sponsor("cosponsor", sponsor.strip()) else: # Committee sponsorship bill.add_sponsor("committee", spons_str.strip()) # Get actions act_rows = info_page.findAll("table", "myth")[1].findAll("tr")[1:] for row in act_rows: cols = row.findAll("td") act_date = cols[0].font.contents[0] act_date = dt.datetime.strptime(act_date, "%m/%d/%y") if cols[2].font.string == "(H)": act_chamber = "lower" elif cols[2].font.string == "(S)": act_chamber = "upper" else: act_chamber = chamber action = cols[3].font.contents[0].strip() if re.match("\w+ Y(\d+) N(\d+)", action): vote = self.parse_vote(bill, action, act_chamber, act_date, cols[1].a["href"]) bill.add_vote(vote) bill.add_action(act_chamber, action, act_date) # Get subjects bill["subjects"] = [] subject_link_re = re.compile(".*subject=\w+$") for subject_link in info_page.findAll("a", href=subject_link_re): subject = subject_link.contents[0].strip() bill["subjects"].append(subject) # Get versions text_list_url = "http://www.legis.state.ak.us/" "basis/get_fulltext.asp?session=%s&bill=%s" % ( session, bill_id, ) text_list = self.soup_parser(self.urlopen(text_list_url)) bill.add_source(text_list_url) text_link_re = re.compile("^get_bill_text?") for text_link in text_list.findAll("a", href=text_link_re): text_name = text_link.parent.previousSibling.contents[0] text_name = text_name.strip() text_url = "http://www.legis.state.ak.us/basis/%s" % (text_link["href"]) bill.add_version(text_name, text_url) self.add_bill(bill)
def scrape_session(self, chamber, year):
    if chamber == 'upper':
        bill_abbr = 'SB|SCR|SJR'
    elif chamber == 'lower':
        bill_abbr = 'HB|HCR|HJR'

    # Sessions last 2 years; 1993-1994 was the 18th
    session = str(18 + ((int(year) - 1993) // 2))
    year2 = str(int(year) + 1)

    # Full calendar year
    date1 = '0101' + year[2:]
    date2 = '1231' + year2[2:]

    # Get bill list
    bill_list_url = 'http://www.legis.state.ak.us/'\
        'basis/range_multi.asp?session=%s&date1=%s&date2=%s' % (
            session, date1, date2)
    self.log("Getting bill list for %s %s (this may take a long time)." %
             (chamber, session))
    bill_list = self.soup_parser(self.urlopen(bill_list_url))

    # Find bill links
    re_str = r"bill=%s\d+" % bill_abbr
    links = bill_list.findAll(href=re.compile(re_str))

    for link in links:
        bill_id = link.contents[0].replace(' ', '')
        bill_name = link.parent.parent.findNext('td').find(
            'font').contents[0].strip()
        bill = Bill(session, chamber, bill_id, bill_name.strip())

        # Get the bill info page
        info_url = "http://www.legis.state.ak.us/basis/%s" % link['href']
        info_page = self.soup_parser(self.urlopen(info_url))
        bill.add_source(info_url)

        # Get sponsors
        spons_str = info_page.find(
            text="SPONSOR(s):").parent.parent.contents[1]
        sponsors_match = re.match(
            r' (SENATOR|REPRESENTATIVE)\([Ss]\) ([^,]+(,[^,]+){0,})',
            spons_str)
        if sponsors_match:
            sponsors = sponsors_match.group(2).split(',')

            # The first name listed is the primary sponsor
            bill.add_sponsor('primary', sponsors[0].strip())

            for sponsor in sponsors[1:]:
                bill.add_sponsor('cosponsor', sponsor.strip())
        else:
            # Committee sponsorship
            bill.add_sponsor('committee', spons_str.strip())

        # Get actions
        act_rows = info_page.findAll('table', 'myth')[1].findAll('tr')[1:]
        for row in act_rows:
            cols = row.findAll('td')
            act_date = cols[0].font.contents[0]
            act_date = dt.datetime.strptime(act_date, '%m/%d/%y')

            if cols[2].font.string == "(H)":
                act_chamber = "lower"
            elif cols[2].font.string == "(S)":
                act_chamber = "upper"
            else:
                act_chamber = chamber

            action = cols[3].font.contents[0].strip()
            if re.match(r"\w+ Y(\d+) N(\d+)", action):
                try:
                    vote = self.parse_vote(bill, action, act_chamber,
                                           act_date, cols[1].a['href'])
                    bill.add_vote(vote)
                except Exception:
                    self.log("Failed parsing vote at %s" %
                             cols[1].a['href'])

            bill.add_action(act_chamber, action, act_date)

        # Get subjects
        bill['subjects'] = []
        subject_link_re = re.compile(r'.*subject=\w+$')
        for subject_link in info_page.findAll('a', href=subject_link_re):
            subject = subject_link.contents[0].strip()
            bill['subjects'].append(subject)

        # Get versions
        text_list_url = "http://www.legis.state.ak.us/"\
            "basis/get_fulltext.asp?session=%s&bill=%s" % (
                session, bill_id)
        text_list = self.soup_parser(self.urlopen(text_list_url))
        bill.add_source(text_list_url)

        text_link_re = re.compile('^get_bill_text?')
        for text_link in text_list.findAll('a', href=text_link_re):
            text_name = text_link.parent.previousSibling.contents[0]
            text_name = text_name.strip()

            text_url = "http://www.legis.state.ak.us/basis/%s" % (
                text_link['href'])

            bill.add_version(text_name, text_url)

        self.save_bill(bill)
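# The r"\w+ Y(\d+) N(\d+)" pattern above is what flags an action string
# as a vote. A standalone sketch of pulling the tallies out of such a
# string; the action text here is hypothetical, and parse_vote's real
# internals are not shown in this section:
def _demo_parse_tally():
    action = "PASSED Y18 N2"  # hypothetical Alaska action string
    match = re.match(r"\w+ Y(\d+) N(\d+)", action)
    if match:
        yes_count = int(match.group(1))  # 18
        no_count = int(match.group(2))   # 2
        passed = yes_count > no_count    # True
        return yes_count, no_count, passed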
def scrape_bill(self, chamber, current_bill, session):
    other_chamber = 'upper' if chamber == 'lower' else 'lower'

    status_url = "http://alisondb.legislature.state.al.us/acas/"\
        "SESSBillsStatusResultsMac.asp?BillNumber=%s"\
        "&GetStatus=Get+Status&session=%s" % (current_bill, session[0])
    with self.soup_context(status_url) as bill:
        if "Your ACAS Session has expired." in str(bill):
            raise Exception("Expired cookie - you'll have to run "
                            "with -n to skip caching")

        try:
            bill_id = int(re.findall(r'BTN([0-9]+)', str(bill))[0])
        except IndexError:
            raise Exception("No bill found. Hopefully that means "
                            "it's the end of the session")

        title = bill.find("td", {'colspan': '7'}).string
        self.log("Starting parse of %s" % current_bill)

        # Create our bill!
        bill = Bill(session[1], chamber, current_bill, title.strip())

        # Add sponsors and co-sponsors
        sponsor_url = "http://alisondb.legislature.state.al.us/acas/"\
            "ACTIONSponsorsResultsMac.asp?OID=%d" % bill_id
        with self.soup_context(sponsor_url) as sponsors:
            # This pains me.
            (primary, secondary) = sponsors.findAll(
                "table",
                text="Co-Sponsors")[0].parent.parent.parent.findAll('table')
            for p in primary.findAll('td'):
                bill.add_sponsor('primary', p.string)
            for s in secondary.findAll('td'):
                bill.add_sponsor('cosponsor', s.string)

        history_url = "http://alisondb.legislature.state.al.us/acas/"\
            "ACTIONHistoryResultsMac.asp?OID=%d" % bill_id
        with self.soup_context(history_url) as history:
            actions = history.findAll(
                'table',
                text="Committee")[0].parent.parent.parent.findAll('tr')

            # Column order: Date, Amend/Subst, Matter, Committee,
            # Nay, Yea, Abs, Vote
            for event in actions:
                e = event.findAll('td')
                if len(e) == 0:
                    continue

                date = e[0].string
                amend = e[1].find('input')
                matter = e[2].string
                y_votes = e[5].string
                n_votes = e[4].string
                a_votes = e[6].string

                if not matter:
                    continue

                roll = e[7].find('input')

                if date is not None:
                    act_date = dt.datetime.strptime(date, '%m/%d/%Y')

                if amend is not None:
                    splitter = re.findall(
                        r'documentSelected\(\'(\w*)\',\'([\w\d-]*)\','
                        r'\'([\w\.\-]*)\',\'([\w\d/]*)\',\'([\w\d]*)\','
                        r'\'([\w\s]*)\'', str(amend))[0]
                    amend = "http://alisondb.legislature.state.al.us/"\
                        "acas/%s/%s" % (splitter[3], splitter[2])
                    bill.add_document(matter, amend)

                if roll is not None:
                    splitter = re.findall(
                        r'voteSelected\(\'(\d*)\',\'(\d*)\',\'(\d*)\','
                        r'\'(.*)\',\'(\d*)\'', str(roll))[0]
                    roll = "http://alisondb.legislature.state.al.us/acas/"\
                        "GetRollCallVoteResults.asp"\
                        "?MOID=%s&VOTE=%s&BODY=%s&SESS=%s" % (
                            splitter[0], splitter[1], splitter[2],
                            splitter[4])
                    with self.soup_context(roll) as votes:
                        vote_rows = votes.findAll(
                            'table',
                            text='Member')[0].parent.parent.parent.findAll('tr')

                        yea_votes = int(votes.findAll(
                            'tr',
                            text='Total Yea:')[0].parent.parent.findAll('td')[2].string)
                        nay_votes = int(votes.findAll(
                            'tr',
                            text='Total Nay:')[0].parent.parent.findAll('td')[2].string)
                        abs_votes = int(votes.findAll(
                            'tr',
                            text='Total Abs:')[0].parent.parent.findAll('td')[2].string)
                        p_votes = len(votes.findAll('tr', text='P'))

                        # Vote(chamber, date, motion, passed,
                        #      yes_count, no_count, other_count)
                        vote = Vote(chamber, act_date, matter,
                                    (yea_votes > nay_votes),
                                    yea_votes, nay_votes,
                                    abs_votes + p_votes)
                        vote.add_source(roll)

                        for row in vote_rows:
                            skip = str(row)
                            if "Total Yea" in skip or "Total Nay" in skip \
                               or "Total Abs" in skip:
                                continue

                            html_layouts_are_awesome = row.findAll('td')
                            if len(html_layouts_are_awesome) == 0:
                                continue

                            # Members are laid out two per table row
                            (name, t) = (html_layouts_are_awesome[0].string,
                                         html_layouts_are_awesome[2].string)
                            self.dumb_vote(vote, name, t)

                            if len(html_layouts_are_awesome) > 3:
                                (name, t) = (
                                    html_layouts_are_awesome[4].string,
                                    html_layouts_are_awesome[6].string)
                                self.dumb_vote(vote, name, t)

                        bill.add_vote(vote)

                if y_votes is not None:
                    yea_votes = self.dumber_vote(y_votes)
                    nay_votes = self.dumber_vote(n_votes)
                    abs_votes = self.dumber_vote(a_votes)
                    vote = Vote(chamber, act_date, matter,
                                (yea_votes > nay_votes),
                                yea_votes, nay_votes, abs_votes)
                    bill.add_vote(vote)

                bill.add_action(chamber, matter, act_date)

    self.add_bill(bill)
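# Hedged sketches of the two vote helpers called above; their real
# implementations aren't shown in this section, so these are only one
# plausible reading of the call sites.
def dumb_vote(self, vote, name, t):
    """Record one member's vote, given the member name and a one-letter
    vote type from the roll-call table ('Y' yea, 'N' nay, else other)."""
    if name is None:
        return
    name = name.strip()
    if t == 'Y':
        vote.yes(name)
    elif t == 'N':
        vote.no(name)
    else:
        vote.other(name)


def dumber_vote(self, s):
    """Turn a tally cell from the history table into an int, treating
    anything non-numeric (or a missing cell) as zero."""
    try:
        return int(s.strip())
    except (AttributeError, ValueError):
        return 0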
def scrape_new_session(self, chamber, session):
    """
    Scrapes SD's bill data from 2009 on.
    """
    if chamber == 'upper':
        bill_abbr = 'SB'
    elif chamber == 'lower':
        bill_abbr = 'HB'

    # Get bill list page
    session_url = 'http://legis.state.sd.us/sessions/%s/' % session
    bill_list_url = session_url + 'BillList.aspx'
    self.log('Getting bill list for %s %s' % (chamber, session))
    bill_list = self.soup_parser(self.urlopen(bill_list_url))

    # Format of bill link contents
    bill_re = re.compile(u'%s\xa0(\d+)' % bill_abbr)
    date_re = re.compile(r'\d{2}/\d{2}/\d{4}')

    for bill_link in bill_list.findAll('a'):
        if len(bill_link.contents) == 0:
            # Empty link
            continue

        bill_match = bill_re.search(bill_link.contents[0])
        if not bill_match:
            continue

        # Parse bill ID and name
        bill_id = bill_link.contents[0].replace(u'\xa0', ' ')
        bill_name = bill_link.findNext().contents[0]

        # Download history page
        hist_url = session_url + bill_link['href']
        history = self.soup_parser(self.urlopen(hist_url))

        bill = Bill(session, chamber, bill_id, bill_name)
        bill.add_source(hist_url)

        # Get all bill versions
        text_table = history.findAll('table')[1]
        for row in text_table.findAll('tr')[2:]:
            version_path = row.findAll('td')[1].a['href']
            version_url = "http://legis.state.sd.us/sessions/%s/%s" % (
                session, version_path)
            version_name = row.findAll('td')[1].a.contents[0].strip()
            bill.add_version(version_name, version_url)

        # Get actions
        act_table = history.find('table')
        for act_row in act_table.findAll('tr')[6:]:
            if act_row.find(text='Action'):
                continue

            # Get the date (if we can't find one, this isn't an action)
            date_match = date_re.match(act_row.td.a.contents[0])
            if not date_match:
                continue
            act_date = date_match.group(0)
            act_date = dt.datetime.strptime(act_date, "%m/%d/%Y")

            # Get the action string
            action = ""
            for node in act_row.findAll('td')[1].contents:
                if hasattr(node, 'contents'):
                    action += node.contents[0]

                    if node.contents[0].startswith('YEAS'):
                        # This is a vote!
                        vote_url = "http://legis.state.sd.us/sessions/"\
                            "%s/%s" % (session, node['href'])
                        vote = self.scrape_new_vote(vote_url)
                        vote['date'] = act_date
                        bill.add_vote(vote)
                else:
                    action += node
            action = action.strip()

            # Add action
            bill.add_action(chamber, action, act_date)

        self.save_bill(bill)
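# The hasattr(node, 'contents') test above separates BeautifulSoup Tag
# nodes (like the <a> wrapping a YEAS link) from bare NavigableString
# text while reassembling the action cell. A standalone sketch on a
# hypothetical fragment, assuming BeautifulSoup 3 as used elsewhere here:
def _demo_action_cell():
    from BeautifulSoup import BeautifulSoup
    cell = BeautifulSoup('<td>Do Pass, <a href="vote123.htm">'
                         'YEAS 33</a>, NAYS 2</td>').td
    action = ""
    for node in cell.contents:
        if hasattr(node, 'contents'):
            # Tag: take its text (a YEAS link additionally triggers
            # vote scraping in the real method)
            action += node.contents[0]
        else:
            # NavigableString: literal text between tags
            action += node
    return action.strip()  # 'Do Pass, YEAS 33, NAYS 2'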
def scrape_old_session(self, chamber, session):
    """
    Scrapes SD's bill data from 1997 through 2008.
    """
    if chamber == 'upper':
        bill_abbr = 'SB'
    else:
        bill_abbr = 'HB'

    # Get bill list page (and replace malformed tags that some versions
    # of BeautifulSoup choke on)
    session_url = 'http://legis.state.sd.us/sessions/%s/' % session
    bill_list_url = session_url + 'billlist.htm'
    bill_list = self.soup_parser(self.urlopen(bill_list_url))

    # Bill and text link formats
    bill_re = re.compile(r'%s (\d+)' % bill_abbr)
    text_re = re.compile(r'/sessions/%s/bills/%s.*\.htm' % (
        session, bill_abbr), re.IGNORECASE)
    date_re = re.compile(r'\d{2}/\d{2}/\d{4}')

    for bill_link in bill_list.findAll('a', href=re.compile(r'\d\.htm$')):
        if len(bill_link.contents) == 0:
            # Empty link
            continue

        bill_match = bill_re.match(bill_link.contents[0])
        if not bill_match:
            # Not a bill link
            continue

        # Get the bill ID and name
        bill_id = bill_link.contents[0]
        bill_name = bill_link.findNext().contents[0]

        # Get history page (replacing malformed tag)
        hist_url = session_url + bill_link['href']
        history = self.soup_parser(self.urlopen(hist_url))

        # Get URL of latest version of bill (should be listed last)
        bill_url = history.findAll('a', href=text_re)[-1]['href']
        bill_url = 'http://legis.state.sd.us%s' % bill_url

        # Add bill
        bill = Bill(session, chamber, bill_id, bill_name)
        bill.add_source(hist_url)

        # Get bill versions
        text_table = history.findAll('table')[1]
        for row in text_table.findAll('tr')[2:]:
            version_path = row.findAll('td')[1].a['href']
            version_url = "http://legis.state.sd.us" + version_path
            version_name = row.findAll('td')[1].a.contents[0].strip()
            bill.add_version(version_name, version_url)

        # Get actions
        act_table = history.find('table')
        for act_row in act_table.findAll('tr')[6:]:
            if act_row.find(text="Action"):
                continue

            # Get the date (if we can't find one, this isn't an action)
            date_match = date_re.match(act_row.td.a.contents[0])
            if not date_match:
                continue
            act_date = date_match.group(0)
            act_date = dt.datetime.strptime(act_date, "%m/%d/%Y")

            # Get the action string
            action = ""
            for node in act_row.findAll('td')[1].contents:
                if hasattr(node, 'contents'):
                    action += node.contents[0]

                    if node.contents[0].startswith('YEAS'):
                        # This is a vote!
                        if node['href'][0] == '/':
                            vote_url = "http://legis.state.sd.us/%s" % (
                                node['href'])
                        else:
                            vote_url = "http://legis.state.sd.us/"\
                                "sessions/%s/%s" % (session, node['href'])

                        vote = self.scrape_old_vote(vote_url)
                        vote['date'] = act_date
                        bill.add_vote(vote)
                else:
                    action += node
            action = action.strip()

            # Add action
            bill.add_action(chamber, action, act_date)

        self.save_bill(bill)
def scrape_bills(self, chamber, year):
    session = "%s%d" % (year, int(year) + 1)
    if session not in self.metadata['sessions']:
        raise NoDataForYear(year)

    if chamber == 'upper':
        measure_abbr = 'SB'
        chamber_name = 'SENATE'
        house_type = 'S'
    else:
        measure_abbr = 'AB'
        chamber_name = 'ASSEMBLY'
        house_type = 'A'

    bills = self.session.query(CABill).filter_by(
        session_year=session).filter_by(
        measure_type=measure_abbr)

    for bill in bills:
        bill_session = session
        if bill.session_num != '0':
            bill_session += ' Special Session %s' % bill.session_num

        bill_id = bill.short_bill_id
        version = self.session.query(CABillVersion).filter_by(
            bill=bill).filter(CABillVersion.bill_xml != None).first()
        if not version:
            # not enough data to import
            continue

        fsbill = Bill(bill_session, chamber, bill_id, version.title,
                      short_title=version.short_title)

        for author in version.authors:
            if author.house == chamber_name:
                fsbill.add_sponsor(author.contribution, author.name)

        for action in bill.actions:
            if not action.action:
                # NULL action text seems to be an error on CA's part,
                # unless it has some meaning I'm missing
                continue
            actor = action.actor or chamber
            fsbill.add_action(actor, action.action, action.action_date)

        for vote in bill.votes:
            if vote.vote_result == '(PASS)':
                result = True
            else:
                result = False

            full_loc = vote.location.description
            first_part = full_loc.split(' ')[0].lower()
            if first_part in ['asm', 'assembly']:
                vote_chamber = 'lower'
                vote_location = ' '.join(full_loc.split(' ')[1:])
            elif first_part.startswith('sen'):
                vote_chamber = 'upper'
                vote_location = ' '.join(full_loc.split(' ')[1:])
            else:
                vote_chamber = ''
                vote_location = full_loc

            fsvote = Vote(vote_chamber, vote.vote_date_time,
                          vote.motion.motion_text or '',
                          result, vote.ayes, vote.noes, vote.abstain,
                          threshold=vote.threshold,
                          location=vote_location)

            for record in vote.votes:
                if record.vote_code == 'AYE':
                    fsvote.yes(record.legislator_name)
                elif record.vote_code.startswith('NO'):
                    fsvote.no(record.legislator_name)
                else:
                    fsvote.other(record.legislator_name)

            fsbill.add_vote(fsvote)

        self.add_bill(fsbill)
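# The queries above read CABill / CABillVersion, SQLAlchemy models mapped
# over California's bulk legislative data. Their declarations aren't shown
# in this section; below is a minimal sketch limited to the attributes this
# method touches, with table and column names assumed. The actions, votes,
# and authors relationships used above are omitted for brevity.
from sqlalchemy import Column, ForeignKey, String
from sqlalchemy.orm import relationship
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()


class CABill(Base):
    __tablename__ = 'bill_tbl'  # assumed table name
    bill_id = Column(String, primary_key=True)
    session_year = Column(String)   # e.g. '20092010'
    session_num = Column(String)    # '0' for the regular session
    measure_type = Column(String)   # 'SB' or 'AB'
    short_bill_id = Column(String)  # becomes the fiftystates bill_id


class CABillVersion(Base):
    __tablename__ = 'bill_version_tbl'  # assumed table name
    bill_version_id = Column(String, primary_key=True)
    bill_id = Column(String, ForeignKey('bill_tbl.bill_id'))
    title = Column(String)
    short_title = Column(String)
    bill_xml = Column(String)  # NULL when the version text is missing
    bill = relationship(CABill, backref='versions')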