def scrape_bill(self, chamber, current_bill, session):
    """Scrape one bill from the Alabama ALISON site and save it.

    Fetches the bill status page, then the sponsor and history pages keyed
    by the bill's internal id, and records sponsors, documents, votes and
    actions on a new ``Bill`` before passing it to ``self.save_bill``.

    Args:
        chamber: chamber label handed unchanged to ``Bill``, ``Vote`` and
            ``add_action``.
        current_bill: bill number, used in the status URL and as the bill id.
        session: indexable pair; ``session[0]`` is sent in the status URL
            and ``session[1]`` is passed to ``Bill``.

    Raises:
        Exception: if the site reports an expired ACAS session, or if no
            ``BTN<digits>`` id is present on the status page.
    """

    def _total(page, label):
        # The total sits in the third cell of the row containing the label.
        return int(page.findAll("tr", text=label)[0].parent.parent.findAll("td")[2].string)

    status_url = (
        "http://alisondb.legislature.state.al.us/acas/SESSBillsStatusResultsMac.asp"
        "?BillNumber=%s&GetStatus=Get+Status&session=%s" % (current_bill, session[0])
    )
    with self.soup_context(status_url) as status_page:
        status_html = str(status_page)
        if "Your ACAS Session has expired." in status_html:
            raise Exception("Expired cookie - you'll have to run with -n to skip caching")
        try:
            bill_id = int(re.findall(r"BTN([0-9]+)", status_html)[0])
        except IndexError:
            # No BTN id on the page: only the missing-match case is expected
            # here, so do not swallow unrelated errors.
            raise Exception("No bill found. Hopefully that means it's the end of the session")

        title = status_page.find("td", {"colspan": "7"}).string
        self.log("Starting parse of %s" % current_bill)

        # create our bill!
        bill = Bill(session[1], chamber, current_bill, title.strip())

        # add sponsors and co-sponsors
        sponsors_url = (
            "http://alisondb.legislature.state.al.us/acas/ACTIONSponsorsResultsMac.asp?OID=%d" % bill_id
        )
        with self.soup_context(sponsors_url) as sponsors:
            # Climb from the "Co-Sponsors" label to the enclosing layout and
            # take its two tables: primary sponsors, then co-sponsors.
            (primary, secondary) = (
                sponsors.findAll("table", text="Co-Sponsors")[0].parent.parent.parent.findAll("table")
            )
            for cell in primary.findAll("td"):
                bill.add_sponsor("primary", cell.string)
            for cell in secondary.findAll("td"):
                bill.add_sponsor("cosponsor", cell.string)

        history_url = (
            "http://alisondb.legislature.state.al.us/acas/ACTIONHistoryResultsMac.asp?OID=%d" % bill_id
        )
        with self.soup_context(history_url) as history:
            rows = history.findAll("table", text="Committee")[0].parent.parent.parent.findAll("tr")

            # Rows without their own date reuse the most recent dated row.
            act_date = None

            # Columns: Date Amend/Subst Matter Committee Nay Yea Abs Vote
            for row in rows:
                cells = row.findAll("td")
                if not cells:
                    continue

                date = cells[0].string
                amend = cells[1].find("input")
                matter = cells[2].string
                n_votes = cells[4].string
                y_votes = cells[5].string
                a_votes = cells[6].string

                if not matter:
                    continue

                roll = cells[7].find("input")

                if date is not None:
                    act_date = dt.datetime.strptime(date, "%m/%d/%Y")
                if act_date is None:
                    # No date seen yet, so there is nothing to attach this
                    # row to (previously this was an unbound-variable crash).
                    self.log("Skipping undated history row for %s" % current_bill)
                    continue

                if amend is not None:
                    doc_parts = re.findall(
                        r"documentSelected\(\'(\w*)\',\'([\w\d-]*)\',\'([\w\.\-]*)\',\'([\w\d/]*)\',\'([\w\d]*)\',\'([\w\s]*)\'",
                        str(amend),
                    )[0]
                    doc_url = "http://alisondb.legislature.state.al.us/acas/%s/%s" % (
                        doc_parts[3],
                        doc_parts[2],
                    )
                    bill.add_document(matter, doc_url)

                if roll is not None:
                    vote_parts = re.findall(
                        r"voteSelected\(\'(\d*)\',\'(\d*)\',\'(\d*)\',\'(.*)\',\'(\d*)\'", str(roll)
                    )[0]
                    roll_url = (
                        "http://alisondb.legislature.state.al.us/acas/GetRollCallVoteResults.asp?MOID=%s&VOTE=%s&BODY=%s&SESS=%s"
                        % (vote_parts[0], vote_parts[1], vote_parts[2], vote_parts[4])
                    )
                    with self.soup_context(roll_url) as votes:
                        vote_rows = votes.findAll("table", text="Member")[0].parent.parent.parent.findAll("tr")
                        yea_votes = _total(votes, "Total Yea:")
                        nay_votes = _total(votes, "Total Nay:")
                        abs_votes = _total(votes, "Total Abs:")
                        p_votes = len(votes.findAll("tr", text="P"))

                        # chamber, date, motion, passed, yes_count, no_count, other_count
                        vote = Vote(
                            chamber,
                            act_date,
                            matter,
                            (yea_votes > nay_votes),
                            yea_votes,
                            nay_votes,
                            abs_votes + p_votes,
                        )
                        vote.add_source(roll_url)

                        for vote_row in vote_rows:
                            row_html = str(vote_row)
                            if any(label in row_html for label in ("Total Yea", "Total Nay", "Total Abs")):
                                continue
                            vote_cells = vote_row.findAll("td")
                            if not vote_cells:
                                continue
                            # Each row holds up to two (name, vote) pairs side by side.
                            self.dumb_vote(vote, vote_cells[0].string, vote_cells[2].string)
                            if len(vote_cells) > 3:
                                self.dumb_vote(vote, vote_cells[4].string, vote_cells[6].string)
                        bill.add_vote(vote)

                # NOTE(review): a row that has both a roll-call link and a yea
                # count records two votes (one above, one here) - confirm
                # this is intended.
                if y_votes is not None:
                    yea_votes = self.dumber_vote(y_votes)
                    nay_votes = self.dumber_vote(n_votes)
                    abs_votes = self.dumber_vote(a_votes)
                    vote = Vote(chamber, act_date, matter, (yea_votes > nay_votes), yea_votes, nay_votes, abs_votes)
                    bill.add_vote(vote)

                bill.add_action(chamber, matter, act_date)

        self.save_bill(bill)
def scrape_new_session(self, chamber, session):
    """
    Scrapes SD's bill data from 2009 on.

    Downloads the session's ``BillList.aspx`` page, follows every link whose
    text looks like ``SB<nbsp>N`` / ``HB<nbsp>N`` to the bill's history page,
    and records versions, actions and votes on a ``Bill`` that is then
    passed to ``self.save_bill``.

    Args:
        chamber: 'upper' selects Senate bills ('SB'); any other value is
            treated as the lower chamber ('HB').
        session: session identifier interpolated into the site URLs and
            passed to ``Bill``.
    """
    # Previously an unrecognised chamber left bill_abbr unbound.
    bill_abbr = 'SB' if chamber == 'upper' else 'HB'

    # Get bill list page
    session_url = 'http://legis.state.sd.us/sessions/%s/' % session
    bill_list_url = session_url + 'BillList.aspx'
    self.log('Getting bill list for %s %s' % (chamber, session))
    bill_list = self.soup_parser(self.urlopen(bill_list_url))

    # Format of bill link contents (abbreviation, non-breaking space, number)
    bill_re = re.compile(u'%s\xa0(\\d+)' % bill_abbr)
    date_re = re.compile(r'\d{2}/\d{2}/\d{4}')

    for bill_link in bill_list.findAll('a'):
        if not bill_link.contents:
            # Empty link
            continue

        link_text = bill_link.contents[0]
        if not bill_re.search(link_text):
            continue

        # Parse bill ID and name
        bill_id = link_text.replace(u'\xa0', ' ')
        bill_name = bill_link.findNext().contents[0]

        # Download history page
        hist_url = session_url + bill_link['href']
        history = self.soup_parser(self.urlopen(hist_url))

        bill = Bill(session, chamber, bill_id, bill_name)
        bill.add_source(hist_url)

        # First table holds the actions, second the bill versions.
        tables = history.findAll('table')
        text_table = tables[1]
        act_table = tables[0]

        # Get all bill versions
        for row in text_table.findAll('tr')[2:]:
            version_link = row.findAll('td')[1].a
            version_url = session_url + version_link['href']
            version_name = version_link.contents[0].strip()
            bill.add_version(version_name, version_url)

        # Get actions
        for act_row in act_table.findAll('tr')[6:]:
            if act_row.find(text='Action'):
                continue

            # Get the date (if can't find one then this isn't an action)
            date_cell = act_row.td
            date_link = date_cell.a if date_cell is not None else None
            if date_link is None or not date_link.contents:
                continue
            date_match = date_re.match(date_link.contents[0])
            if not date_match:
                continue
            act_date = dt.datetime.strptime(date_match.group(0), "%m/%d/%Y")

            # Get the action string: plain text nodes are used as-is, tags
            # contribute their first child.
            action_parts = []
            for node in act_row.findAll('td')[1].contents:
                if not hasattr(node, 'contents'):
                    action_parts.append(node)
                    continue
                if not node.contents:
                    # Empty tag: nothing to add.
                    continue
                node_text = node.contents[0]
                action_parts.append(node_text)
                if node_text.startswith('YEAS'):
                    # This is a vote!
                    vote_url = session_url + node['href']
                    vote = self.scrape_new_vote(vote_url)
                    vote['date'] = act_date
                    bill.add_vote(vote)
            action = ''.join(action_parts).strip()

            # Add action
            bill.add_action(chamber, action, act_date)

        self.save_bill(bill)
def scrape_bills(self, chamber, year):
    """Scrape consecutively numbered bills for one chamber until none is found.

    Starts at SB 1 for the upper chamber or HB 4001 otherwise and keeps
    asking ``self.scrape_bill`` for the next bill number. The first missing
    page ends the scrape. Each bill found gets its title, sponsors, versions,
    documents, source URL and actions recorded and is passed to
    ``self.save_bill``.

    Args:
        chamber: 'upper' for Senate bills; any other value selects House bills.
        year: session year, as an int or a numeric string.

    Raises:
        NoDataForPeriod: if ``year`` is even.
    """
    if int(year) % 2 == 0:
        raise NoDataForPeriod(year)

    # Keep the requested year for the Bill's session: scrape_bill hands back
    # its own year value on every call.
    session_year = str(int(year))

    if chamber == 'upper':
        bill_no, abbr = 1, 'SB'
    else:
        bill_no, abbr = 4001, 'HB'

    while True:
        (bill_page, year) = self.scrape_bill(year, abbr, bill_no)
        # if we can't find a page, we must be done. This is a healthy thing.
        if bill_page is None:
            return

        subject = bill_page.findAll(id='frg_billstatus_ObjectSubject')[0]
        title = ''.join(self.flatten(subject))
        title = title.replace('\n', '').replace('\r', '')
        bill_id = "%s %d" % (abbr, bill_no)

        the_bill = Bill(session_year, chamber, bill_id, title)

        # sponsors: the first link is the primary sponsor, the rest cosponsors
        sponsor_links = bill_page.findAll(id='frg_billstatus_SponsorList')[0].findAll('a')
        for position, link in enumerate(sponsor_links):
            role = 'primary' if position == 0 else 'cosponsor'
            the_bill.add_sponsor(role, link.string)

        # versions
        for doc in bill_page.findAll(id='frg_billstatus_DocumentGridTable')[0].findAll('tr'):
            parsed = self.parse_doc(the_bill, doc)
            if parsed:
                the_bill.add_version(*parsed)

        # documents: both sections are optional, so look the element up
        # directly instead of searching the page text for its id.
        for section_id in ('frg_billstatus_HlaTable', 'frg_billstatus_SfaSection'):
            sections = bill_page.findAll(id=section_id)
            if not sections:
                continue
            for doc in sections[0].findAll('tr'):
                parsed = self.parse_doc(the_bill, doc)
                if parsed:
                    the_bill.add_document(*parsed)

        the_bill.add_source(
            'http://legislature.mi.gov/doc.aspx?%d-%s-%04d' % (int(year), abbr, bill_no)
        )
        self.parse_actions(the_bill, bill_page.findAll(id='frg_billstatus_HistoriesGridView')[0])
        self.save_bill(the_bill)
        bill_no += 1
def scrape_old_session(self, chamber, session):
    """
    Scrape SD's bill data from 1997 through 2008.

    Downloads the session's ``billlist.htm`` page, follows every link whose
    href ends in ``<digit>.htm`` and whose text starts with ``SB N`` /
    ``HB N`` to the bill's history page, and records versions, actions and
    votes on a ``Bill`` that is then passed to ``self.save_bill``.

    Args:
        chamber: 'upper' selects Senate bills ('SB'); any other value
            selects House bills ('HB').
        session: session identifier interpolated into the site URLs and
            passed to ``Bill``.
    """
    bill_abbr = 'SB' if chamber == 'upper' else 'HB'

    # Get bill list page
    session_url = 'http://legis.state.sd.us/sessions/%s/' % session
    bill_list_url = session_url + 'billlist.htm'
    bill_list = self.soup_parser(self.urlopen(bill_list_url))

    # Bill link text and action date formats
    bill_re = re.compile(r'%s (\d+)' % bill_abbr)
    date_re = re.compile(r'\d{2}/\d{2}/\d{4}')

    for bill_link in bill_list.findAll('a', href=re.compile(r'\d\.htm$')):
        if not bill_link.contents:
            # Empty link
            continue

        link_text = bill_link.contents[0]
        if not bill_re.match(link_text):
            # Not bill link
            continue

        # Get the bill ID and name
        bill_id = link_text
        bill_name = bill_link.findNext().contents[0]

        # Get history page
        hist_url = session_url + bill_link['href']
        history = self.soup_parser(self.urlopen(hist_url))

        # Add bill
        bill = Bill(session, chamber, bill_id, bill_name)
        bill.add_source(hist_url)

        # First table holds the actions, second the bill versions.
        tables = history.findAll('table')
        text_table = tables[1]
        act_table = tables[0]

        # Get bill versions (hrefs here are site-absolute paths)
        for row in text_table.findAll('tr')[2:]:
            version_link = row.findAll('td')[1].a
            version_url = "http://legis.state.sd.us" + version_link['href']
            version_name = version_link.contents[0].strip()
            bill.add_version(version_name, version_url)

        # Get actions
        for act_row in act_table.findAll('tr')[6:]:
            if act_row.find(text="Action"):
                continue

            # Get the date (if can't find one then this isn't an action)
            date_cell = act_row.td
            date_link = date_cell.a if date_cell is not None else None
            if date_link is None or not date_link.contents:
                continue
            date_match = date_re.match(date_link.contents[0])
            if not date_match:
                continue
            act_date = dt.datetime.strptime(date_match.group(0), "%m/%d/%Y")

            # Get the action string: plain text nodes are used as-is, tags
            # contribute their first child.
            action_parts = []
            for node in act_row.findAll('td')[1].contents:
                if not hasattr(node, 'contents'):
                    action_parts.append(node)
                    continue
                if not node.contents:
                    # Empty tag: nothing to add.
                    continue
                node_text = node.contents[0]
                action_parts.append(node_text)
                if node_text.startswith('YEAS'):
                    # This is a vote!
                    href = node['href']
                    if href[0] == '/':
                        # Site-absolute path: join without a second slash.
                        vote_url = "http://legis.state.sd.us%s" % href
                    else:
                        vote_url = "http://legis.state.sd.us/"\
                            "sessions/%s/%s" % (session, href)
                    vote = self.scrape_old_vote(vote_url)
                    vote['date'] = act_date
                    bill.add_vote(vote)
            action = ''.join(action_parts).strip()

            # Add action
            bill.add_action(chamber, action, act_date)

        self.save_bill(bill)