def scrape(self, chamber, session):
    """Emit one synthetic bill exercising the full Bill/Vote pipeline.

    The bill belongs to the requested chamber; the opposite chamber only
    appears as the actor of one action.
    """
    self.validate_session(session)

    # Pick the fake bill id and remember the opposite chamber.
    if chamber == 'upper':
        other_chamber, fake_id = 'lower', 'SB 1'
    else:
        other_chamber, fake_id = 'upper', 'HB 1'

    bill = Bill(session, chamber, fake_id, 'A super bill')
    bill.add_source('http://example.com/')
    bill.add_version('As Introduced', 'http://example.com/SB1.html')
    bill.add_document('Google', 'http://google.com')
    bill.add_sponsor('primary', 'Bob Smith')
    bill.add_sponsor('secondary', 'Johnson, Sally')

    # One passing upper-chamber vote...
    first_date = datetime.datetime.strptime('1/29/2010', '%m/%d/%Y')
    passing_vote = Vote('upper', first_date, 'Final passage', True, 2, 0, 0)
    passing_vote.yes('Smith')
    passing_vote.yes('Johnson')

    # ...and one failing lower-chamber vote with an "other" voter.
    second_date = datetime.datetime.strptime('1/30/2010', '%m/%d/%Y')
    failing_vote = Vote('lower', second_date, 'Final passage', False, 0, 1, 1)
    failing_vote.no('Bob Smith')
    failing_vote.other('S. Johnson')

    bill.add_vote(passing_vote)
    bill.add_vote(failing_vote)

    bill.add_action(chamber, 'introduced', first_date)
    bill.add_action(chamber, 'read first time', second_date)
    bill.add_action(other_chamber, 'introduced', second_date)
    self.save_bill(bill)
def scrape(self, chamber, year):
    """Scrape California bills for one chamber from the CAPublic mirror.

    Validates ``year`` against the metadata sessions, then walks every
    bill of the chamber's measure type, importing the latest version,
    sponsors, categorized actions and roll-call votes.

    Raises:
        NoDataForPeriod: if the derived session is not in ``metadata``.
    """
    session = "%s%d" % (year, int(year) + 1)
    if session not in [s_ for t in metadata['terms']
                       for s_ in t['sessions']]:
        raise NoDataForPeriod(year)
    if chamber == 'upper':
        measure_abbr = 'SB'
        chamber_name = 'SENATE'
    else:
        measure_abbr = 'AB'
        chamber_name = 'ASSEMBLY'
    bills = self.session.query(CABill).filter_by(
        session_year=session).filter_by(
        measure_type=measure_abbr)
    for bill in bills:
        bill_session = session
        if bill.session_num != '0':
            bill_session += ' Special Session %s' % bill.session_num
        bill_id = bill.short_bill_id
        # Only versions with bill_xml carry usable text/metadata.
        version = self.session.query(CABillVersion).filter_by(
            bill=bill).filter(CABillVersion.bill_xml != None).first()
        if not version:
            # not enough data to import
            continue
        fsbill = Bill(bill_session, chamber, bill_id, version.title,
                      short_title=version.short_title)
        for author in version.authors:
            if author.house == chamber_name:
                fsbill.add_sponsor(author.contribution, author.name)
        for action in bill.actions:
            if not action.action:
                # NULL action text seems to be an error on CA's part,
                # unless it has some meaning I'm missing
                continue
            actor = action.actor or chamber
            actor = actor.strip()
            match = re.match(r'(Assembly|Senate)($| \(Floor)', actor)
            if match:
                actor = {'Assembly': 'lower',
                         'Senate': 'upper'}[match.group(1)]
            elif actor.startswith('Governor'):
                actor = 'executive'
            else:
                actor = re.sub('^Assembly', 'lower', actor)
                actor = re.sub('^Senate', 'upper', actor)
            # Categorize the action string (avoid shadowing builtin `type`).
            atype = []
            act_str = action.action
            if act_str.startswith('Introduced'):
                atype.append('bill:introduced')
            if 'To Com' in act_str:
                atype.append('committee:referred')
            if 'Read third time. Passed.' in act_str:
                atype.append('bill:passed')
            if 'Approved by Governor' in act_str:
                # Use the standard category, consistent with the other
                # CA scrapers in this file.
                atype.append('governor:signed')
            if 'Item veto' in act_str:
                atype.append('governor:vetoed:line-item')
            if not atype:
                atype = ['other']
            fsbill.add_action(actor, act_str, action.action_date,
                              type=atype)
        for vote in bill.votes:
            result = vote.vote_result == '(PASS)'
            # Location strings look like "Asm Appropriations" /
            # "Sen Floor"; split off the chamber prefix.
            full_loc = vote.location.description
            first_part = full_loc.split(' ')[0].lower()
            if first_part in ['asm', 'assembly']:
                vote_chamber = 'lower'
                vote_location = ' '.join(full_loc.split(' ')[1:])
            elif first_part.startswith('sen'):
                vote_chamber = 'upper'
                vote_location = ' '.join(full_loc.split(' ')[1:])
            else:
                vote_chamber = ''
                vote_location = full_loc
            fsvote = Vote(vote_chamber,
                          vote.vote_date_time,
                          vote.motion.motion_text or '',
                          result,
                          vote.ayes, vote.noes,
                          vote.abstain,
                          threshold=vote.threshold,
                          location=vote_location)
            for record in vote.votes:
                if record.vote_code == 'AYE':
                    fsvote.yes(record.legislator_name)
                elif record.vote_code.startswith('NO'):
                    fsvote.no(record.legislator_name)
                else:
                    fsvote.other(record.legislator_name)
            fsbill.add_vote(fsvote)
        self.save_bill(fsbill)
def scrape_session(self, chamber, year):
    """Scrape Alaska bills for one chamber and two-year session.

    Fetches the BASIS bill-range listing for the calendar years
    ``year``..``year+1``, then each bill's info page for sponsors,
    actions, votes, subjects and full-text versions.
    """
    if chamber == 'upper':
        bill_abbr = 'SB|SCR|SJR'
    elif chamber == 'lower':
        bill_abbr = 'HB|HCR|HJR'

    # Sessions last 2 years, 1993-1994 was the 18th.
    # Use floor division so the session number stays an integer even
    # under true division ('/' would yield e.g. '18.5').
    session = str(18 + ((int(year) - 1993) // 2))
    year2 = str(int(year) + 1)

    # Full calendar year
    date1 = '0101' + year[2:]
    date2 = '1231' + year2[2:]

    # Get bill list
    bill_list_url = 'http://www.legis.state.ak.us/'\
        'basis/range_multi.asp?session=%s&date1=%s&date2=%s' % (
        session, date1, date2)
    self.log("Getting bill list for %s %s (this may take a long time)." %
             (chamber, session))
    bill_list = self.soup_parser(self.urlopen(bill_list_url))

    # Find bill links
    re_str = r"bill=%s\d+" % bill_abbr
    links = bill_list.findAll(href=re.compile(re_str))

    for link in links:
        bill_id = link.contents[0].replace(' ', '')
        bill_name = link.parent.parent.findNext('td').find(
            'font').contents[0].strip()
        bill = Bill(session, chamber, bill_id, bill_name.strip())

        # Get the bill info page and strip malformed t
        info_url = "http://www.legis.state.ak.us/basis/%s" % link['href']
        info_page = self.soup_parser(self.urlopen(info_url))
        bill.add_source(info_url)

        # Get sponsors
        spons_str = info_page.find(
            text="SPONSOR(s):").parent.parent.contents[1]
        sponsors_match = re.match(
            r' (SENATOR|REPRESENTATIVE)\([Ss]\) ([^,]+(,[^,]+){0,})',
            spons_str)
        if sponsors_match:
            sponsors = sponsors_match.group(2).split(',')
            bill.add_sponsor('primary', sponsors[0].strip())
            for sponsor in sponsors[1:]:
                bill.add_sponsor('cosponsor', sponsor.strip())
        else:
            # Committee sponsorship
            bill.add_sponsor('committee', spons_str.strip())

        # Get actions
        act_rows = info_page.findAll('table', 'myth')[1].findAll('tr')[1:]
        for row in act_rows:
            cols = row.findAll('td')
            act_date = cols[0].font.contents[0]
            act_date = dt.datetime.strptime(act_date, '%m/%d/%y')

            if cols[2].font.string == "(H)":
                act_chamber = "lower"
            elif cols[2].font.string == "(S)":
                act_chamber = "upper"
            else:
                act_chamber = chamber

            action = cols[3].font.contents[0].strip()
            if re.match(r"\w+ Y(\d+) N(\d+)", action):
                # Action line embeds a roll call; vote parsing is
                # best-effort but must not swallow KeyboardInterrupt,
                # so catch Exception rather than using a bare except.
                try:
                    vote = self.parse_vote(bill, action,
                                           act_chamber, act_date,
                                           cols[1].a['href'])
                    bill.add_vote(vote)
                except Exception:
                    self.log("Failed parsing vote at %s" %
                             cols[1].a['href'])

            bill.add_action(act_chamber, action, act_date)

        # Get subjects
        bill['subjects'] = []
        subject_link_re = re.compile(r'.*subject=\w+$')
        for subject_link in info_page.findAll('a', href=subject_link_re):
            subject = subject_link.contents[0].strip()
            bill['subjects'].append(subject)

        # Get versions
        text_list_url = "http://www.legis.state.ak.us/"\
            "basis/get_fulltext.asp?session=%s&bill=%s" % (
            session, bill_id)
        text_list = self.soup_parser(self.urlopen(text_list_url))
        bill.add_source(text_list_url)

        text_link_re = re.compile('^get_bill_text?')
        for text_link in text_list.findAll('a', href=text_link_re):
            text_name = text_link.parent.previousSibling.contents[0]
            text_name = text_name.strip()
            text_url = "http://www.legis.state.ak.us/basis/%s" % (
                text_link['href'])
            bill.add_version(text_name, text_url)

        self.save_bill(bill)
def scrape_bill_pages(self, session, year_abr):
    """ assemble information on a bill from a number of DBF files """
    # Each DBF file keys records by (billtype, billnumber); the bill_id
    # used throughout is the concatenation, e.g. 'A' + '123' -> 'A123'.

    #Main Bill information
    main_bill_url, main_bill_db = self.get_dbf(year_abr, 'MAINBILL')

    # keep a dictionary of bills (mapping bill_id to Bill obj)
    bill_dict = {}

    for rec in main_bill_db:
        bill_type = rec["billtype"]
        bill_number = int(rec["billnumber"])
        bill_id = bill_type + str(bill_number)
        title = rec["synopsis"]
        # 'A...' prefixes are Assembly bills; everything else is Senate.
        if bill_type[0] == 'A':
            chamber = "lower"
        else:
            chamber = "upper"
        # bill_type[1:] (e.g. 'CR', 'JR', '') selects the openstates type.
        bill = Bill(str(session), chamber, bill_id, title,
                    type=self._bill_types[bill_type[1:]])
        bill.add_source(main_bill_url)
        bill_dict[bill_id] = bill

    #Sponsors
    bill_sponsors_url, bill_sponsors_db = self.get_dbf(year_abr, 'BILLSPON')

    for rec in bill_sponsors_db:
        bill_type = rec["billtype"]
        bill_number = int(rec["billnumber"])
        bill_id = bill_type + str(bill_number)
        bill = bill_dict[bill_id]
        name = rec["sponsor"]
        sponsor_type = rec["type"]
        if sponsor_type == 'P':
            sponsor_type = "Primary"
        else:
            sponsor_type = "Co-sponsor"
        bill.add_sponsor(sponsor_type, name)

    #Documents
    bill_document_url, bill_document_db = self.get_dbf(year_abr, 'BILLWP')
    #print bill_document_db[2]
    for rec in bill_document_db:
        bill_type = rec["billtype"]
        bill_number = int(rec["billnumber"])
        bill_id = bill_type + str(bill_number)
        bill = bill_dict[bill_id]
        document = rec["document"]
        # Document paths are Windows-style; keep only the last two
        # components to build the web path.
        document = document.split('\\')
        document = document[-2] + "/" + document[-1]
        # NOTE: `year` is only used by the commented-out FTP doc_url below.
        year = str(year_abr) + str((year_abr + 1))
        #doc_url = "ftp://www.njleg.state.nj.us/%s/%s" % (year, document)
        htm_url = 'http://www.njleg.state.nj.us/%s/Bills/%s' % (
            year_abr, document.replace('.DOC', '.HTM'))

        # name document based _doctype
        doc_name = self._doctypes[rec['doctype']]
        if rec['comment']:
            doc_name += ' ' + rec['comment']

        # Doctypes listed in _version_types are full bill texts.
        if rec['doctype'] in self._version_types:
            bill.add_version(doc_name, htm_url)
        else:
            bill.add_document(doc_name, htm_url)

    #Senate Votes
    # Vote files are zipped CSVs, one per chamber per year; 2010 only
    # has the first-year files.
    file1 = 'A' + str(year_abr)
    file2 = 'A' + str(year_abr + 1)
    file3 = 'S' + str(year_abr)
    file4 = 'S' + str(year_abr + 1)
    if str(year_abr) != '2010':
        vote_info_list = [file1, file2, file3, file4]
    else:
        vote_info_list = [file1, file3]
    for bill_vote_file in vote_info_list:
        s_vote_url = 'ftp://www.njleg.state.nj.us/votes/%s.zip' % bill_vote_file
        s_vote_zip, resp = self.urlretrieve(s_vote_url)
        zipedfile = zipfile.ZipFile(s_vote_zip)
        vfile = "%s.txt" % bill_vote_file
        vote_file = zipedfile.open(vfile, 'U')
        vdict_file = csv.DictReader(vote_file)

        votes = {}
        if bill_vote_file[0] == "A":
            chamber = "lower"
        else:
            chamber = "upper"
        for rec in vdict_file:
            bill_id = rec["Bill"]
            bill_id = bill_id.strip()
            leg = rec["Full_Name"]
            date = rec["Session_Date"]
            date = datetime.strptime(date, "%m/%d/%Y")
            action = rec["Action"]
            leg_vote = rec["Legislator_Vote"]
            # One Vote object per (bill, action); individual legislator
            # rows are accumulated into it.
            vote_id = bill_id + "_" + action
            vote_id = vote_id.replace(" ", "_")
            # passed is filled in below once counts are known.
            passed = None

            if vote_id not in votes:
                votes[vote_id] = Vote(chamber, date, action, passed, None,
                                      None, None, bill_id=bill_id)
            if leg_vote == "Y":
                votes[vote_id].yes(leg)
            elif leg_vote == "N":
                votes[vote_id].no(leg)
            else:
                votes[vote_id].other(leg)

        #Counts yes/no/other votes and saves overall vote
        for vote in votes.itervalues():
            vote_yes_count = len(vote["yes_votes"])
            vote_no_count = len(vote["no_votes"])
            vote_other_count = len(vote["other_votes"])
            vote["yes_count"] = vote_yes_count
            vote["no_count"] = vote_no_count
            vote["other_count"] = vote_other_count
            # NOTE(review): simple-majority heuristic; ignores any
            # supermajority threshold the motion might require.
            if vote_yes_count > vote_no_count:
                vote["passed"] = True
            else:
                vote["passed"] = False
            vote_bill_id = vote["bill_id"]
            bill = bill_dict[vote_bill_id]
            bill.add_vote(vote)

    #Actions
    bill_action_url, bill_action_db = self.get_dbf(year_abr, 'BILLHIST')

    for rec in bill_action_db:
        bill_type = rec["billtype"]
        bill_number = int(rec["billnumber"])
        bill_id = bill_type + str(bill_number)
        bill = bill_dict[bill_id]
        action = rec["action"]
        date = rec["dateaction"]
        actor = rec["house"]
        comment = rec["comment"]
        action, atype = self.categorize_action(action)
        if comment:
            action += (' ' + comment)
        bill.add_action(actor, action, date, type=atype)

    # Subjects
    subject_url, subject_db = self.get_dbf(year_abr, 'BILLSUBJ')
    for rec in subject_db:
        bill_id = rec['billtype'] + str(int(rec['billnumber']))
        bill = bill_dict.get(bill_id)
        if bill:
            bill.setdefault('subjects', []).append(rec['subjectkey'])
        else:
            self.warning('invalid bill id in BILLSUBJ.DBF: %s' % bill_id)

    # save all bills at the end
    for bill in bill_dict.itervalues():
        # add sources
        bill.add_source(bill_sponsors_url)
        bill.add_source(bill_document_url)
        bill.add_source(bill_action_url)
        bill.add_source(subject_url)
        self.save_bill(bill)
def scrape_bill(self, chamber, session, url):
    """Scrape one Florida bill page: title, history actions, sponsors,
    versions and chamber votes, then save the bill."""
    url = url + "&Year=%s" % session
    with self.urlopen(url) as page:
        # NOTE(review): the first replace argument appears to be a
        # non-breaking space (U+00A0) normalized to a regular space —
        # confirm against the original source before editing.
        page = page.replace(' ', ' ').replace('<br>', '\n')
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        # Headline looks like "SB 123: Actual Title" — keep the title part.
        title = page.xpath('//h3')[0].text.strip()
        title = re.match(r"^\w+\s+\d+:\s+(.*)$", title).group(1)

        bill_id = page.xpath("string(//pre[@class='billhistory']/b)")
        bill_id = bill_id.split()[0].strip()

        bill = Bill(session, chamber, bill_id, title)
        bill.add_source(url)

        hist = page.xpath("string(//pre[@class='billhistory'])").strip()
        # One history entry: date, chamber, then continuation lines
        # indented by exactly 16 spaces.
        act_re = re.compile(r'^ (\d\d/\d\d/\d\d) (SENATE|HOUSE)'
                            r'(.*\n(\s{16,16}.*\n){0,})', re.MULTILINE)

        # Actions
        for match in act_re.finditer(hist):
            action = match.group(3).replace('\n', ' ')
            action = re.sub(r'\s+', ' ', action).strip()
            if match.group(2) == 'SENATE':
                actor = 'upper'
            else:
                actor = 'lower'
            date = match.group(1)
            date = datetime.datetime.strptime(date, "%m/%d/%y")
            # Entries may bundle several actions separated by journal
            # citations like "-HJ 123;".
            for act_text in re.split(' -[HS]J \d+;? ?', action):
                act_text = act_text.strip()
                if not act_text:
                    continue
                types = []
                act_lower = act_text.lower()
                if act_lower.startswith('introduced'):
                    types.append('bill:introduced')
                if 'referred to' in act_lower:
                    types.append('committee:referred')
                if 'died in committee' in act_lower:
                    types.append('committee:failed')
                if 'favorable by' in act_lower:
                    types.append('committee:passed:favorable')
                if 'amendment(s) adopted' in act_lower:
                    types.append('amendment:passed')
                # NOTE(review): unlike the other scrapers in this file,
                # an empty `types` list is passed through rather than
                # defaulting to ['other'].
                bill.add_action(actor, act_text, date, type=types)

        # Sponsors
        primary_sponsor = re.search(r'by ([^;(\n]+;?|\w+)',
                                    hist).group(1).strip('; ')
        bill.add_sponsor('primary', primary_sponsor)

        cospon_re = re.compile(r'\((CO-SPONSORS|CO-AUTHORS)\) '
                               '([\w .]+(;[\w .\n]+){0,})', re.MULTILINE)
        match = cospon_re.search(hist)
        if match:
            for cosponsor in match.group(2).split(';'):
                cosponsor = cosponsor.replace('\n', '').strip()
                bill.add_sponsor('cosponsor', cosponsor)

        # Versions
        for link in page.xpath("//a[contains(@href, 'billtext/html')]"):
            version = link.xpath('string(../../td[1])').strip()
            bill.add_version(version, link.attrib['href'])

        # House Votes (lowercase 'h' vs uppercase 'S' matches the site's
        # actual file naming)
        for link in page.xpath("//a[contains(@href, 'votes/html/h')]"):
            bill.add_vote(self.scrape_lower_vote(link.attrib['href']))

        # Senate Votes
        for link in page.xpath("//a[contains(@href, 'votes/html/S')]"):
            bill.add_vote(self.scrape_upper_vote(link.attrib['href']))

        self.save_bill(bill)
def scrape_bill_pages(self, session, year_abr):
    """Assemble NJ bill information from the legislature's DBF files.

    Builds a Bill per MAINBILL record, then layers on sponsors,
    documents, vote files and actions, and finally saves every bill.

    Bug fixed: the original added sources to — and saved — only the
    bill that happened to be bound to the loop variable last, so almost
    every bill in ``bill_dict`` was silently dropped.  All bills are now
    saved in a final pass over ``bill_dict``.
    """
    #Main Bill information
    main_bill_url = 'ftp://www.njleg.state.nj.us/ag/%sdata/MAINBILL.DBF' % (year_abr)
    MAINBILL_dbf, resp = self.urlretrieve(main_bill_url)
    main_bill_db = dbf.Dbf(MAINBILL_dbf)

    # bill_id ('A123', 'S45', ...) -> Bill
    bill_dict = {}

    for rec in main_bill_db:
        bill_type = rec["billtype"]
        bill_number = int(rec["billnumber"])
        bill_id = bill_type + str(bill_number)
        title = rec["synopsis"]
        # 'A...' prefixes are Assembly; everything else Senate.
        if bill_type[0] == 'A':
            chamber = "lower"
        else:
            chamber = "upper"
        bill = Bill(str(session), chamber, bill_id, title)
        bill.add_source(main_bill_url)
        bill_dict[bill_id] = bill

    #Sponsors
    bill_sponsors_url = 'ftp://www.njleg.state.nj.us/ag/%sdata/BILLSPON.DBF' % (year_abr)
    SPONSORS_dbf, resp = self.urlretrieve(bill_sponsors_url)
    bill_sponsors_db = dbf.Dbf(SPONSORS_dbf)

    for rec in bill_sponsors_db:
        bill_type = rec["billtype"]
        bill_number = int(rec["billnumber"])
        bill_id = bill_type + str(bill_number)
        bill = bill_dict[bill_id]
        name = rec["sponsor"]
        sponsor_type = rec["type"]
        if sponsor_type == 'P':
            sponsor_type = "Primary"
        else:
            sponsor_type = "Co-sponsor"
        bill.add_sponsor(sponsor_type, name)

    #Documents
    bill_document_url = 'ftp://www.njleg.state.nj.us/ag/%sdata/BILLWP.DBF' % (year_abr)
    DOC_dbf, resp = self.urlretrieve(bill_document_url)
    bill_document_db = dbf.Dbf(DOC_dbf)
    for rec in bill_document_db:
        bill_type = rec["billtype"]
        bill_number = int(rec["billnumber"])
        bill_id = bill_type + str(bill_number)
        bill = bill_dict[bill_id]
        document = rec["document"]
        # Windows-style path; keep the last two components for the URL.
        document = document.split('\\')
        doc_name = document[-1]
        document = document[-2] + "/" + document[-1]
        year = str(year_abr) + str((year_abr + 1))
        doc_url = "ftp://www.njleg.state.nj.us/%s" % year
        doc_url = doc_url + "/" + document
        bill.add_document(doc_name, doc_url)

    #Senate Votes
    # One zipped CSV per chamber per year; 2010 only has the
    # first-year files.
    file1 = 'A' + str(year_abr)
    file2 = 'A' + str(year_abr + 1)
    file3 = 'S' + str(year_abr)
    file4 = 'S' + str(year_abr + 1)
    if str(year_abr) != '2010':
        vote_info_list = [file1, file2, file3, file4]
    else:
        vote_info_list = [file1, file3]
    for bill_vote_file in vote_info_list:
        s_vote_url = 'ftp://www.njleg.state.nj.us/votes/%s.zip' % bill_vote_file
        s_vote_zip, resp = self.urlretrieve(s_vote_url)
        zipedfile = zipfile.ZipFile(s_vote_zip)
        vfile = "%s.txt" % bill_vote_file
        vote_file = zipedfile.open(vfile, 'U')
        vdict_file = csv.DictReader(vote_file)

        votes = {}
        if bill_vote_file[0] == "A":
            chamber = "lower"
        else:
            chamber = "upper"
        for rec in vdict_file:
            bill_id = rec["Bill"]
            bill_id = bill_id.strip()
            leg = rec["Full_Name"]
            date = rec["Session_Date"]
            date = datetime.strptime(date, "%m/%d/%Y")
            action = rec["Action"]
            leg_vote = rec["Legislator_Vote"]
            # One Vote per (bill, action); legislator rows accumulate.
            vote_id = bill_id + "_" + action
            vote_id = vote_id.replace(" ", "_")
            # passed is computed below once counts are known.
            passed = None

            if vote_id not in votes:
                votes[vote_id] = Vote(chamber, date, action, passed, None,
                                      None, None, bill_id=bill_id)
            if leg_vote == "Y":
                votes[vote_id].yes(leg)
            elif leg_vote == "N":
                votes[vote_id].no(leg)
            else:
                votes[vote_id].other(leg)

        #Counts yes/no/other votes and saves overall vote
        for vote in votes.itervalues():
            vote_yes_count = len(vote["yes_votes"])
            vote_no_count = len(vote["no_votes"])
            vote_other_count = len(vote["other_votes"])
            vote["yes_count"] = vote_yes_count
            vote["no_count"] = vote_no_count
            vote["other_count"] = vote_other_count
            # Simple-majority heuristic.
            if vote_yes_count > vote_no_count:
                vote["passed"] = True
            else:
                vote["passed"] = False
            vote_bill_id = vote["bill_id"]
            bill = bill_dict[vote_bill_id]
            bill.add_vote(vote)

    #Actions
    bill_action_url = 'ftp://www.njleg.state.nj.us/ag/%sdata/BILLHIST.DBF' % (year_abr)
    ACTION_dbf, resp = self.urlretrieve(bill_action_url)
    bill_action_db = dbf.Dbf(ACTION_dbf)

    for rec in bill_action_db:
        bill_type = rec["billtype"]
        bill_number = int(rec["billnumber"])
        bill_id = bill_type + str(bill_number)
        bill = bill_dict[bill_id]
        action = rec["action"]
        date = rec["dateaction"]
        actor = rec["house"]
        comment = rec["comment"]
        bill.add_action(actor, action, date, comment=comment)

    # Add the remaining sources to, and save, EVERY bill — not just the
    # last one touched above.
    for bill in bill_dict.itervalues():
        bill.add_source(bill_sponsors_url)
        bill.add_source(bill_document_url)
        bill.add_source(bill_action_url)
        self.save_bill(bill)
def scrape_new_session(self, chamber, session):
    """
    Scrapes SD's bill data from 2009 on.
    """
    if chamber == 'upper':
        bill_abbr = 'SB'
    elif chamber == 'lower':
        bill_abbr = 'HB'

    # Get bill list page
    session_url = 'http://legis.state.sd.us/sessions/%s/' % session
    bill_list_url = session_url + 'BillList.aspx'
    self.log('Getting bill list for %s %s' % (chamber, session))
    bill_list = self.soup_parser(self.urlopen(bill_list_url))

    # Format of bill link contents: abbreviation, non-breaking space,
    # bill number (e.g. u'SB\xa0123').
    bill_re = re.compile(u'%s\xa0(\d+)' % bill_abbr)
    date_re = re.compile('\d{2}/\d{2}/\d{4}')

    for bill_link in bill_list.findAll('a'):
        if len(bill_link.contents) == 0:
            # Empty link
            continue

        #print bill_link.contents[0]
        bill_match = bill_re.search(bill_link.contents[0])
        if not bill_match:
            continue

        # Parse bill ID and name
        bill_id = bill_link.contents[0].replace(u'\xa0', ' ')
        bill_name = bill_link.findNext().contents[0]

        # Download history page
        hist_url = session_url + bill_link['href']
        history = self.soup_parser(self.urlopen(hist_url))

        bill = Bill(session, chamber, bill_id, bill_name)
        bill.add_source(hist_url)

        # Get all bill versions (second table on the history page,
        # skipping the two header rows)
        text_table = history.findAll('table')[1]
        for row in text_table.findAll('tr')[2:]:
            #version_date = row.find('td').string
            version_path = row.findAll('td')[1].a['href']
            version_url = "http://legis.state.sd.us/sessions/%s/%s" % (
                session, version_path)
            version_name = row.findAll('td')[1].a.contents[0].strip()
            bill.add_version(version_name, version_url)

        # Get actions (first table; rows before index 6 are headers)
        act_table = history.find('table')
        for act_row in act_table.findAll('tr')[6:]:
            if act_row.find(text='Action'):
                continue

            # Get the date (if can't find one then this isn't an action)
            date_match = date_re.match(act_row.td.a.contents[0])
            if not date_match:
                continue
            act_date = date_match.group(0)
            act_date = dt.datetime.strptime(act_date, "%m/%d/%Y")

            # Get the action string by concatenating the cell's child
            # nodes; tag nodes (have .contents) may also embed a vote link.
            action = ""
            for node in act_row.findAll('td')[1].contents:
                if hasattr(node, 'contents'):
                    action += node.contents[0]

                    if node.contents[0].startswith('YEAS'):
                        # This is a vote!
                        vote_url = "http://legis.state.sd.us/sessions/"\
                                   "%s/%s" % (session, node['href'])
                        vote = self.scrape_new_vote(vote_url)
                        vote['date'] = act_date
                        bill.add_vote(vote)
                else:
                    action += node
            action = action.strip()

            # Add action
            bill.add_action(chamber, action, act_date)

        self.save_bill(bill)
def scrape_old_session(self, chamber, session):
    """
    Scrape SD's bill data from 1997 through 2008.
    """
    if chamber == 'upper':
        bill_abbr = 'SB'
    else:
        bill_abbr = 'HB'

    # Get bill list page (and replace malformed tags that some versions of
    # BeautifulSoup choke on)
    session_url = 'http://legis.state.sd.us/sessions/%s/' % session
    bill_list_url = session_url + 'billlist.htm'
    bill_list = self.soup_parser(self.urlopen(bill_list_url))

    # Bill and text link formats
    bill_re = re.compile('%s (\d+)' % bill_abbr)
    text_re = re.compile('/sessions/%s/bills/%s.*\.htm' % (
        session, bill_abbr), re.IGNORECASE)
    date_re = re.compile('\d{2}/\d{2}/\d{4}')

    for bill_link in bill_list.findAll('a', href=re.compile('\d\.htm$')):
        if len(bill_link.contents) == 0:
            # Empty link
            continue

        bill_match = bill_re.match(bill_link.contents[0])
        if not bill_match:
            # Not bill link
            continue

        # Get the bill ID and name
        bill_id = bill_link.contents[0]
        bill_name = bill_link.findNext().contents[0]

        # Get history page (replacing malformed tag)
        hist_url = session_url + bill_link['href']
        history = self.soup_parser(self.urlopen(hist_url))

        # Get URL of latest verion of bill (should be listed last)
        # NOTE(review): bill_url is computed but never attached to the
        # bill — presumably leftover from an earlier revision.
        bill_url = history.findAll('a', href=text_re)[-1]['href']
        bill_url = 'http://legis.state.sd.us%s' % bill_url

        # Add bill
        bill = Bill(session, chamber, bill_id, bill_name)
        bill.add_source(hist_url)

        # Get bill versions (second table, skipping two header rows)
        text_table = history.findAll('table')[1]
        for row in text_table.findAll('tr')[2:]:
            #version_date = row.find('td').string
            version_path = row.findAll('td')[1].a['href']
            version_url = "http://legis.state.sd.us" + version_path
            version_name = row.findAll('td')[1].a.contents[0].strip()
            bill.add_version(version_name, version_url)

        # Get actions (first table; rows before index 6 are headers)
        act_table = history.find('table')
        for act_row in act_table.findAll('tr')[6:]:
            if act_row.find(text="Action"):
                continue

            # Get the date (if can't find one then this isn't an action)
            date_match = date_re.match(act_row.td.a.contents[0])
            if not date_match:
                continue
            act_date = date_match.group(0)
            act_date = dt.datetime.strptime(act_date, "%m/%d/%Y")

            # Get the action string by concatenating the cell's child
            # nodes; tag nodes (have .contents) may also embed a vote link.
            action = ""
            for node in act_row.findAll('td')[1].contents:
                if hasattr(node, 'contents'):
                    action += node.contents[0]

                    if node.contents[0].startswith('YEAS'):
                        # This is a vote!  Vote hrefs are sometimes
                        # absolute paths, sometimes session-relative.
                        if node['href'][0] == '/':
                            vote_url = "http://legis.state.sd.us/%s" % (
                                node['href'])
                        else:
                            vote_url = "http://legis.state.sd.us/"\
                                       "sessions/%s/%s" % (session,
                                                           node['href'])
                        vote = self.scrape_old_vote(vote_url)
                        vote['date'] = act_date
                        bill.add_vote(vote)
                else:
                    action += node
            action = action.strip()

            # Add action
            bill.add_action(chamber, action, act_date)

        self.save_bill(bill)
def scrape_bill_type(self, chamber, session, bill_type, type_abbr):
    """Scrape all CA bills of one measure type (e.g. 'AB', 'SB') for a
    chamber/session from the CAPublic database mirror: versions with
    metadata, sponsors, categorized actions and roll-call votes."""
    if chamber == "upper":
        chamber_name = "SENATE"
    else:
        chamber_name = "ASSEMBLY"

    bills = self.session.query(CABill).filter_by(session_year=session).filter_by(measure_type=type_abbr)

    for bill in bills:
        bill_session = session
        if bill.session_num != "0":
            bill_session += " Special Session %s" % bill.session_num

        bill_id = bill.short_bill_id

        # Title is filled in from the versions below.
        fsbill = Bill(bill_session, chamber, bill_id, "")

        # Construct session for web query, going from '20092010' to '0910'
        source_session = session[2:4] + session[6:8]

        # Turn 'AB 10' into 'ab_10'
        source_num = "%s_%s" % (bill.measure_type.lower(), bill.measure_num)

        # Construct a fake source url
        source_url = "http://www.leginfo.ca.gov/cgi-bin/postquery?" "bill_number=%s&sess=%s" % (
            source_num,
            source_session,
        )

        fsbill.add_source(source_url)

        title = ""
        short_title = ""
        type = ["bill"]
        subject = ""
        # Only versions with bill_xml are usable; the last iteration's
        # values win for title/type/subject, and `version` stays bound
        # for the sponsor loop below.
        for version in (
            self.session.query(CABillVersion).filter_by(bill=bill).filter(CABillVersion.bill_xml != None)
        ):
            title = version.title
            short_title = version.short_title
            type = [bill_type]

            if version.appropriation == "Yes":
                type.append("appropriation")
            if version.fiscal_committee == "Yes":
                type.append("fiscal committee")
            if version.local_program == "Yes":
                type.append("local program")
            if version.urgency == "Yes":
                type.append("urgency")
            if version.taxlevy == "Yes":
                type.append("tax levy")

            subject = version.subject

            fsbill.add_version(
                version.bill_version_id,
                "",
                date=version.bill_version_action_date.date(),
                title=version.title,
                short_title=version.short_title,
                subject=[subject],
                type=type,
            )

        if not title:
            self.warning("Couldn't find title for %s, skipping" % bill_id)
            continue

        fsbill["title"] = title
        fsbill["short_title"] = short_title
        fsbill["type"] = type
        fsbill["subjects"] = [subject]

        # Sponsors come from the last version seen above.
        for author in version.authors:
            if author.house == chamber_name:
                fsbill.add_sponsor(author.contribution, author.name)

        for action in bill.actions:
            if not action.action:
                # NULL action text seems to be an error on CA's part,
                # unless it has some meaning I'm missing
                continue
            actor = action.actor or chamber
            actor = actor.strip()
            match = re.match(r"(Assembly|Senate)($| \(Floor)", actor)
            if match:
                actor = {"Assembly": "lower", "Senate": "upper"}[match.group(1)]
            elif actor.startswith("Governor"):
                actor = "executive"
            else:
                actor = re.sub("^Assembly", "lower", actor)
                actor = re.sub("^Senate", "upper", actor)

            # Categorize the action text.
            type = []
            act_str = action.action
            if act_str.startswith("Introduced"):
                type.append("bill:introduced")
            if "To Com" in act_str:
                type.append("committee:referred")
            if "Read third time. Passed." in act_str:
                type.append("bill:passed")
            if "Approved by Governor" in act_str:
                type.append("governor:signed")
            if "Item veto" in act_str:
                type.append("governor:vetoed:line-item")
            if not type:
                type = ["other"]

            fsbill.add_action(actor, act_str, action.action_date.date(), type=type)

        for vote in bill.votes:
            if vote.vote_result == "(PASS)":
                result = True
            else:
                result = False

            # Location strings look like "Asm Appropriations" /
            # "Sen Floor"; split off the chamber prefix.
            full_loc = vote.location.description
            first_part = full_loc.split(" ")[0].lower()
            if first_part in ["asm", "assembly"]:
                vote_chamber = "lower"
                vote_location = " ".join(full_loc.split(" ")[1:])
            elif first_part.startswith("sen"):
                vote_chamber = "upper"
                vote_location = " ".join(full_loc.split(" ")[1:])
            else:
                raise ScrapeError("Bad location: %s" % full_loc)

            motion = vote.motion.motion_text or ""

            if "Third Reading" in motion or "3rd Reading" in motion:
                vtype = "passage"
            elif "Do Pass" in motion:
                vtype = "passage"
            else:
                vtype = "other"

            # Strip boilerplate (session names, chamber prefixes, bill
            # numbers) out of the motion text.
            motion = motion.strip()
            # Why did it take until 2.7 to get a flags argument on re.sub?
            motion = re.compile(r"(\w+)( Extraordinary)? Session$", re.IGNORECASE).sub("", motion)
            motion = re.compile(r"^(Senate|Assembly) ", re.IGNORECASE).sub("", motion)
            motion = re.sub(r"^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? ", "", motion)
            motion = re.sub(r" \(\w+\)$", "", motion)
            motion = re.sub(r"(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$", "", motion)
            motion = re.sub(r"(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? "
                            r"Urgency Clause$", "(Urgency Clause)", motion)
            motion = re.sub(r"\s+", " ", motion)

            if not motion:
                self.warning("Got blank motion on vote for %s" % bill_id)
                continue

            fsvote = Vote(
                vote_chamber,
                self._tz.localize(vote.vote_date_time),
                motion,
                result,
                int(vote.ayes),
                int(vote.noes),
                int(vote.abstain),
                threshold=vote.threshold,
                type=vtype,
            )

            # Non-floor votes are committee votes.
            if vote_location != "Floor":
                fsvote["committee"] = vote_location

            for record in vote.votes:
                if record.vote_code == "AYE":
                    fsvote.yes(record.legislator_name)
                elif record.vote_code.startswith("NO"):
                    fsvote.no(record.legislator_name)
                else:
                    fsvote.other(record.legislator_name)

            fsbill.add_vote(fsvote)

        self.save_bill(fsbill)
def scrape_bill_type(self, chamber, session, bill_type, type_abbr):
    """Scrape all CA bills of one measure type for a chamber/session
    from the CAPublic database mirror: versions with metadata, sponsors,
    categorized actions and roll-call votes."""
    if chamber == 'upper':
        chamber_name = 'SENATE'
    else:
        chamber_name = 'ASSEMBLY'

    bills = self.session.query(CABill).filter_by(
        session_year=session).filter_by(
        measure_type=type_abbr)

    for bill in bills:
        bill_session = session
        if bill.session_num != '0':
            bill_session += ' Special Session %s' % bill.session_num

        bill_id = bill.short_bill_id

        # Title is filled in from the versions below.
        fsbill = Bill(bill_session, chamber, bill_id, '')

        title = ''
        short_title = ''
        type = ['bill']
        subject = ''
        # Only versions with bill_xml are usable; the last iteration's
        # values win for title/type/subject, and `version` stays bound
        # for the sponsor loop below.
        for version in self.session.query(CABillVersion).filter_by(
            bill=bill).filter(CABillVersion.bill_xml != None):
            title = version.title
            short_title = version.short_title
            type = [bill_type]

            if version.appropriation == 'Yes':
                type.append('appropriation')
            if version.fiscal_committee == 'Yes':
                type.append('fiscal committee')
            if version.local_program == 'Yes':
                type.append('local program')
            if version.urgency == 'Yes':
                type.append('urgency')
            if version.taxlevy == 'Yes':
                type.append('tax levy')

            subject = version.subject

            fsbill.add_version(version.bill_version_id, '',
                               date=version.bill_version_action_date,
                               title=version.title,
                               short_title=version.short_title,
                               subject=[subject],
                               type=type)

        if not title:
            self.warning("Couldn't find title for %s, skipping" % bill_id)
            continue

        fsbill['title'] = title
        fsbill['short_title'] = short_title
        fsbill['type'] = type
        fsbill['subjects'] = [subject]

        # Sponsors come from the last version seen above.
        for author in version.authors:
            if author.house == chamber_name:
                fsbill.add_sponsor(author.contribution, author.name)

        for action in bill.actions:
            if not action.action:
                # NULL action text seems to be an error on CA's part,
                # unless it has some meaning I'm missing
                continue
            actor = action.actor or chamber
            actor = actor.strip()
            match = re.match(r'(Assembly|Senate)($| \(Floor)', actor)
            if match:
                actor = {'Assembly': 'lower',
                         'Senate': 'upper'}[match.group(1)]
            elif actor.startswith('Governor'):
                actor = 'executive'
            else:
                actor = re.sub('^Assembly', 'lower', actor)
                actor = re.sub('^Senate', 'upper', actor)

            # Categorize the action text.
            type = []
            act_str = action.action
            if act_str.startswith('Introduced'):
                type.append('bill:introduced')
            if 'To Com' in act_str:
                type.append('committee:referred')
            if 'Read third time. Passed.' in act_str:
                type.append('bill:passed')
            if 'Approved by Governor' in act_str:
                type.append('governor:signed')
            if 'Item veto' in act_str:
                type.append('governor:vetoed:line-item')
            if not type:
                type = ['other']

            fsbill.add_action(actor, act_str, action.action_date,
                              type=type)

        for vote in bill.votes:
            if vote.vote_result == '(PASS)':
                result = True
            else:
                result = False

            # Location strings look like "Asm Appropriations" /
            # "Sen Floor"; split off the chamber prefix.
            full_loc = vote.location.description
            first_part = full_loc.split(' ')[0].lower()
            if first_part in ['asm', 'assembly']:
                vote_chamber = 'lower'
                vote_location = ' '.join(full_loc.split(' ')[1:])
            elif first_part.startswith('sen'):
                vote_chamber = 'upper'
                vote_location = ' '.join(full_loc.split(' ')[1:])
            else:
                raise ScrapeError("Bad location: %s" % full_loc)

            motion = vote.motion.motion_text or ''

            if "Third Reading" in motion or "3rd Reading" in motion:
                vtype = 'passage'
            elif "Do Pass" in motion:
                vtype = 'passage'
            else:
                vtype = 'other'

            # Strip boilerplate (session names, chamber prefixes, bill
            # numbers) out of the motion text.
            motion = motion.strip()
            # Why did it take until 2.7 to get a flags argument on re.sub?
            motion = re.compile(r'(\w+)( Extraordinary)? Session$',
                                re.IGNORECASE).sub('', motion)
            motion = re.compile(r'^(Senate|Assembly) ',
                                re.IGNORECASE).sub('', motion)
            motion = re.sub(r'^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? ',
                            '', motion)
            motion = re.sub(r' \(\w+\)$', '', motion)
            motion = re.sub(r'(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$',
                            '', motion)
            motion = re.sub(r'(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? '
                            r'Urgency Clause$',
                            '(Urgency Clause)', motion)
            motion = re.sub(r'\s+', ' ', motion)

            if not motion:
                self.warning("Got blank motion on vote for %s" % bill_id)
                continue

            fsvote = Vote(vote_chamber,
                          vote.vote_date_time,
                          motion,
                          result,
                          int(vote.ayes),
                          int(vote.noes),
                          int(vote.abstain),
                          threshold=vote.threshold,
                          type=vtype)

            # Non-floor votes are committee votes.
            if vote_location != 'Floor':
                fsvote['committee'] = vote_location

            for record in vote.votes:
                if record.vote_code == 'AYE':
                    fsvote.yes(record.legislator_name)
                elif record.vote_code.startswith('NO'):
                    fsvote.no(record.legislator_name)
                else:
                    fsvote.other(record.legislator_name)

            fsbill.add_vote(fsvote)

        self.save_bill(fsbill)
def scrape_bill_info(self, session, ld, session_id, bill_id, title):
    """Scrape one Maine bill from LawMakerWeb: summary, actions, and votes.

    Fetches the summary page for the given LD (legislative document)
    number, follows the action-history and roll-call links found on it,
    and saves the assembled Bill (with attached Vote objects) via
    self.save_bill().
    """
    bill_info_url = 'http://www.mainelegislature.org/LawMakerWeb/summary.asp?LD=%s&SessionID=%s' % (ld, session_id)

    with self.urlopen(bill_info_url) as bill_sum_page:
        root = lxml.etree.fromstring(bill_sum_page, lxml.etree.HTMLParser())

        # NOTE(review): sponsor is extracted but never attached to the bill.
        sponsor = root.xpath('string(//tr[3]/td[1]/b[1])')

        # Senate bill ids start with "S"; everything else is a House bill.
        if bill_id[0] == "S":
            chamber = "upper"
        else:
            chamber = "lower"

        bill = Bill(str(session), chamber, bill_id, title)
        bill.add_source(bill_info_url)

        # Actions
        actions_url_addon = root.xpath('string(//table/tr[3]/td/a/@href)')
        actions_url = 'http://www.mainelegislature.org/LawMakerWeb/%s' % actions_url_addon
        bill.add_source(actions_url)

        with self.urlopen(actions_url) as actions_page:
            root2 = lxml.etree.fromstring(actions_page,
                                          lxml.etree.HTMLParser())
            # Rows are 1-indexed in XPath and row 1 is the header, so the
            # first data row is tr[2].
            count = 2
            for mr in root2.xpath("//td[2]/table[2]/tr[position() > 1]/td[1]"):
                date = mr.xpath('string()')
                date = datetime.strptime(date, "%m/%d/%Y")

                actor_path = "string(//td[2]/table/tr[%s]/td[2])" % count
                actor = root2.xpath(actor_path)
                action_path = "string(//td[2]/table/tr[%s]/td[3])" % count
                action = root2.xpath(action_path)
                count = count + 1

                if actor == "House":
                    actor = "lower"
                else:
                    actor = "upper"
                bill.add_action(actor, action, date)

        # Votes
        votes_url_addon = root.xpath('string(//table/tr[9]/td/a/@href)')
        votes_url = 'http://www.mainelegislature.org/LawMakerWeb/%s' % votes_url_addon
        bill.add_source(votes_url)

        with self.urlopen(votes_url) as votes_page:
            vote_root = lxml.etree.fromstring(votes_page,
                                              lxml.etree.HTMLParser())
            for mr in vote_root.xpath('//table[position() > 1]/tr/td/a'):
                vote_detail_addon = mr.xpath('string(@href)')
                vote_detail_url = 'http://www.mainelegislature.org/LawMakerWeb/%s' % vote_detail_addon
                bill.add_source(vote_detail_url)

                with self.urlopen(vote_detail_url) as vote_detail_page:
                    detail_root = lxml.etree.fromstring(
                        vote_detail_page, lxml.etree.HTMLParser())

                    # The site uses both "January 1, 2010" and
                    # "Jan. 1, 2010" date styles; try each in turn.
                    # (Was a bare except; only a parse failure is expected.)
                    date = detail_root.xpath('string(//table[2]//tr[2]/td[3])')
                    try:
                        date = datetime.strptime(date, "%B %d, %Y")
                    except ValueError:
                        date = datetime.strptime(date, "%b. %d, %Y")

                    motion = detail_root.xpath('string(//table[2]//tr[3]/td[3])')
                    passed = detail_root.xpath('string(//table[2]//tr[5]/td[3])') == 'PREVAILS'
                    yes_count = detail_root.xpath('string(//table[2]//tr[6]/td[3])')
                    no_count = detail_root.xpath('string(//table[2]//tr[7]/td[3])')
                    # NOTE(review): absent/excused re-read the tr[6] (yes)
                    # cell — looks like a copy-paste slip, probably meant
                    # tr[8]/tr[9].  Kept as-is pending verification against
                    # a live vote-detail page.
                    absent_count = detail_root.xpath('string(//table[2]//tr[6]/td[3])')
                    excused_count = detail_root.xpath('string(//table[2]//tr[6]/td[3])')
                    other_count = 0

                    # Detail pages for House votes live under a URL
                    # containing "House".
                    if votes_url.find('House') != -1:
                        chamber = "lower"
                    else:
                        chamber = "upper"

                    vote = Vote(chamber, date, motion, passed,
                                int(yes_count), int(no_count), other_count,
                                absent_count=int(absent_count),
                                excused_count=int(excused_count))

                    # Per-legislator roll call: Y = yea, N = nay,
                    # anything else (absent/excused) counts as "other".
                    for member in detail_root.xpath('//table[3]/tr[position() > 1]'):
                        leg = member.xpath('string(td[2])')
                        party = member.xpath('string(td[3])')  # unused, kept for parity
                        leg_vote = member.xpath('string(td[4])')
                        if leg_vote == "Y":
                            vote.yes(leg)
                        elif leg_vote == "N":
                            vote.no(leg)
                        else:
                            vote.other(leg)
                    bill.add_vote(vote)

        self.save_bill(bill)
def scrape_bills(self, chamber_to_scrape, session):
    """Scrape all Mississippi measures for one chamber of a session.

    Reads the session's all-measures index XML, then for each measure in
    the requested chamber fetches its detail XML and builds a Bill with
    sponsors, versions, actions and (where linked) votes, saving each via
    self.save_bill().
    """
    url = 'http://billstatus.ls.state.ms.us/%s/pdf/all_measures/allmsrs.xml' % session

    with self.urlopen(url) as bill_dir_page:
        root = lxml.etree.fromstring(bill_dir_page, lxml.etree.HTMLParser())
        for mr in root.xpath('//lastaction/msrgroup'):
            bill_id = mr.xpath('string(measure)').replace(" ", "")
            # Senate measure ids start with "S" (SB, SC, ...); House otherwise.
            if bill_id[0] == "S":
                chamber = "upper"
            else:
                chamber = "lower"

            # Second character of the id encodes the measure type.
            bill_type = {'B': 'bill', 'C': 'concurrent resolution',
                         'R': 'resolution', 'N': 'nomination'}[bill_id[1]]

            # just skip past bills that are of the wrong chamber
            if chamber != chamber_to_scrape:
                continue

            link = mr.xpath('string(actionlink)').replace("..", "")
            main_doc = mr.xpath('string(measurelink)').replace("../../../", "")
            main_doc_url = 'http://billstatus.ls.state.ms.us/%s' % main_doc
            bill_details_url = 'http://billstatus.ls.state.ms.us/%s/pdf/%s' % (session, link)

            with self.urlopen(bill_details_url) as details_page:
                # Detail XML is latin-1; re-encode so lxml parses cleanly.
                details_page = details_page.decode('latin1').encode('utf8', 'ignore')
                details_root = lxml.etree.fromstring(details_page,
                                                     lxml.etree.HTMLParser())
                title = details_root.xpath('string(//shorttitle)')
                longtitle = details_root.xpath('string(//longtitle)')
                bill = Bill(session, chamber, bill_id, title,
                            type=bill_type, longtitle=longtitle)

                # Sponsors.  <p_name> may be empty, hence the guard.
                # NOTE(review): sponsor URLs always use House_authors/ even
                # for Senate measures — confirm against the site.
                main_sponsor = details_root.xpath('string(//p_name)').split()
                if main_sponsor:
                    main_sponsor = main_sponsor[0]
                    main_sponsor_link = details_root.xpath('string(//p_link)').replace(" ", "_")
                    main_sponsor_url = 'http://billstatus.ls.state.ms.us/%s/pdf/House_authors/%s.xml' % (session, main_sponsor_link)
                    type = "primary"
                    bill.add_sponsor(type, main_sponsor,
                                     main_sponsor_url=main_sponsor_url)
                for author in details_root.xpath('//authors/additional'):
                    leg = author.xpath('string(co_name)').replace(" ", "_")
                    leg_url = 'http://billstatus.ls.state.ms.us/%s/pdf/House_authors/%s.xml' % (session, leg)
                    type = "cosponsor"
                    bill.add_sponsor(type, leg, leg_url=leg_url)

                # Versions
                curr_version = details_root.xpath('string(//current_other)').replace("../../../../", "")
                curr_version_url = "http://billstatus.ls.state.ms.us/" + curr_version
                bill.add_version("Current version", curr_version_url)

                intro_version = details_root.xpath('string(//intro_other)').replace("../../../../", "")
                intro_version_url = "http://billstatus.ls.state.ms.us/" + intro_version
                bill.add_version("As Introduced", intro_version_url)

                # Optional versions: the element holds a real path (it
                # contains "documents") only when that version exists.
                comm_version = details_root.xpath('string(//cmtesub_other)').replace("../../../../", "")
                if comm_version.find("documents") != -1:
                    comm_version_url = "http://billstatus.ls.state.ms.us/" + comm_version
                    bill.add_version("Committee Substitute", comm_version_url)

                passed_version = details_root.xpath('string(//passed_other)').replace("../../../../", "")
                if passed_version.find("documents") != -1:
                    passed_version_url = "http://billstatus.ls.state.ms.us/" + passed_version
                    title = "As Passed the " + chamber
                    bill.add_version(title, passed_version_url)

                asg_version = details_root.xpath('string(//asg_other)').replace("../../../../", "")
                if asg_version.find("documents") != -1:
                    asg_version_url = "http://billstatus.ls.state.ms.us/" + asg_version
                    bill.add_version("Approved by the Governor", asg_version_url)

                # Actions
                for action in details_root.xpath('//history/action'):
                    action_num = action.xpath('string(act_number)').strip()
                    action_num = int(action_num)
                    act_vote = action.xpath('string(act_vote)').replace("../../../..", "")
                    action_desc = action.xpath('string(act_desc)')
                    # First token is "mm/dd"; append the session's year to
                    # get a full date.
                    date, action_desc = action_desc.split(" ", 1)
                    date = date + "/" + session[0:4]
                    date = datetime.strptime(date, "%m/%d/%Y")

                    # "(H) "/"(S) " prefixes mark the acting chamber;
                    # unprefixed actions are executive.
                    if action_desc.startswith("(H)"):
                        actor = "lower"
                        action = action_desc[4:]
                    elif action_desc.startswith("(S)"):
                        actor = "upper"
                        action = action_desc[4:]
                    else:
                        actor = "executive"
                        action = action_desc

                    # Veto actions carry a link to the veto message document.
                    if action.find("Veto") != -1:
                        version_path = details_root.xpath("string(//veto_other)")
                        version_path = version_path.replace("../../../../", "")
                        version_url = "http://billstatus.ls.state.ms.us/" + version_path
                        bill.add_document("Veto", version_url)

                    # Map the action text to a normalized type via the
                    # (prefix, type) table on the class; default 'other'.
                    atype = 'other'
                    for prefix, prefix_type in self._action_types:
                        if action.startswith(prefix):
                            atype = prefix_type
                            break

                    bill.add_action(actor, action, date, type=atype,
                                    action_num=action_num)

                    # A non-empty act_vote is a relative link to a roll call.
                    if act_vote:
                        vote_url = 'http://billstatus.ls.state.ms.us%s' % act_vote
                        vote = self.scrape_votes(vote_url, action, date, actor)
                        bill.add_vote(vote)
                        bill.add_source(vote_url)

                bill.add_source(bill_details_url)
                self.save_bill(bill)
def scrape_bill(self, chamber, session, bill_id):
    """Scrape one Arizona bill: documents page plus the actions page.

    The documents page is a set of javascript:Show* sections (versions,
    fact sheets, agendas, calendars, amendments, videos); the actions
    page is a set of per-stage tables keyed by their heading text.
    Saves the assembled Bill via self.save_bill().
    """
    session_id = self.get_session_id(session)
    url = base_url + 'DocumentsForBill.asp?Bill_Number=%s&Session_ID=%s' % (
        bill_id, session_id)
    with self.urlopen(url) as docs_for_bill:
        root = html.fromstring(docs_for_bill)
        bill_title = root.xpath(
            '//div[@class="ContentPageTitle"]')[1].text.strip()

        # Depending on the progress the bill has made through the house
        # some table might not exist, the links that have javascript:Show****
        # have a table with related documents/calanders/agendas/versions
        # I am skipping the sponsors link because that information is on the
        # bill overview page where all of the actions are found.
        doc_section_links = root.xpath(
            '//a[contains(@href, "javascript:Show")]')

        bill = Bill(session, chamber, bill_id, bill_title)
        bill.type = self.get_bill_type(bill_id[:-4])
        bill.add_source(url)

        for link in doc_section_links:
            link_id = utils.parse_link_id(link)
            link_text = link.text_content().strip()
            div_path = '//div[@id="%s"]/table//tr' % link_id

            if link_text == 'Show Versions':
                # the first row has only a comment
                for tr in root.xpath(div_path)[1:]:
                    tds = tr.cssselect('td')  # list(tr.iterchildren('td'))
                    if len(tds) >= 4:
                        bill_version = tds[1].text_content().strip()
                        bill_html = tds[2].xpath('string(font/a/@href)')
                        bill_pdf = tds[3].xpath('string(font/a/@href)')
                        bill.add_version(bill_version, bill_html,
                                         pdf_url=bill_pdf)

            elif link_text == 'Show Summaries/Fact Sheets':
                for tr in root.xpath(div_path)[1:]:
                    # the first row has only a comment
                    tds = tr.cssselect('td')
                    if len(tds) > 1:
                        fact_sheet = tds[1].text_content().strip()
                        fact_sheet_url = tds[1].xpath(
                            'string(font/a/@href)')
                        bill.add_document(fact_sheet, fact_sheet_url,
                                          type="fact sheet")

            elif link_text in ('Show Senate Agendas', 'Show House Agendas'):
                agenda_type = 'House Agenda' if re.match('House', link_text) else 'Senate Agenda'
                for tr in root.xpath(div_path)[2:]:
                    # the first row has only a comment
                    # the second row is the table header
                    tds = tr.cssselect('td')
                    if len(tds) >= 8:
                        agenda_committee = tds[0].text_content().strip()
                        agenda_revised = tds[1].text.strip()
                        agenda_cancelled = tds[2].text.strip()
                        agenda_date = tds[3].text_content().strip()
                        agenda_time = tds[4].text_content().strip()
                        agenda_room = tds[5].text_content().strip()
                        agenda_pdf = tds[6].xpath('string(a/@href)').strip()
                        agenda_html = tds[7].xpath('string(a/@href)').strip()
                        bill.add_document(agenda_committee, agenda_html,
                                          type=agenda_type)

            elif link_text in ('Show Senate Calendars', 'Show House Calendar'):
                cal_type = 'house calendar' if re.match('House', link_text) else 'senate calendar'
                for tr in root.xpath(div_path)[2:]:
                    # the first row has only a comment
                    # the second row is the table header
                    tds = tr.cssselect('td')
                    if len(tds) >= 6:
                        calendar_name = tds[0].text_content().strip()
                        calendar_number = tds[1].text_content().strip()
                        calendar_modified = True if tds[2].xpath('img') else False
                        calendar_date = tds[3].text_content().strip()
                        calendar_html = tds[5].xpath('string(a/@href)')
                        bill.add_document(calendar_name, calendar_html,
                                          type="calendar")

            elif link_text == 'Show Adopted Amendments':
                for tr in root.xpath(div_path)[1:]:
                    tds = tr.cssselect('td')
                    amendment_title = tds[1].text_content().strip()
                    amendment_link = tds[2].xpath('string(font/a/@href)')
                    bill.add_document(amendment_title, amendment_link,
                                      type='amendment')

            elif link_text == 'Show Proposed Amendments':
                for tr in root.xpath(div_path)[1:]:
                    tds = tr.cssselect('td')
                    if len(tds) >= 3:
                        amendment_title = tds[1].text_content().strip()
                        amendment_link = tds[2].xpath('string(font/a/@href)')
                        bill.add_document(amendment_title, amendment_link,
                                          type='amendment')

            elif link_text == 'Show Bill Videos':
                for tr in root.xpath(div_path)[2:]:
                    tds = tr.cssselect('td')
                    if len(tds) >= 3:
                        video_title = tds[1].text_content().strip()
                        video_link = tds[2].xpath('string(a/@href)')
                        video_date = tds[0].text_content().strip()
                        bill.add_document(video_title, video_link,
                                          date=video_date, type='video')

    # action_url = 'http://www.azleg.gov/FormatDocument.asp?inDoc=/legtext/49leg/2r/bills/hb2001o.asp'
    # again the actions page may or may not have a given table and the order
    # of the actions depends on the chamber the bill originated in.
    ses_num = utils.legislature_to_number(session)
    action_url = base_url + 'FormatDocument.asp?inDoc=/legtext/%s/bills/%so.asp' % (ses_num, bill_id.lower())
    with self.urlopen(action_url) as action_page:
        bill.add_source(action_url)
        root = html.fromstring(action_page)
        action_tables = root.xpath('/html/body/div/table/tr[3]/td[4]/table/tr/td/table/tr/td/table')

        for table in action_tables:
            rows = table.cssselect('tr')
            # 'house' tracks which chamber currently holds the bill;
            # flipped when a TRANSMIT action is seen below.
            house = False if chamber == 'upper' else True
            # Table heading, e.g. "SPONSORS:" -> "SPONSORS" (drop colon).
            action = table.cssselect('td')[0].text_content().strip()[:-1]

            if action == 'SPONSORS':
                if len(rows[0]) == 4:
                    for row in rows:
                        tds = row.cssselect('td')
                        # Pair up (name cell, type cell) columns.
                        sponsors = [tds[i:i+2:] for i in range(1, len(tds), 2)]
                        # NOTE(review): only sponsors[0] (the first pair in
                        # each row) is ever added — confirm later pairs are
                        # intentionally skipped.
                        bill.add_sponsor(sponsors[0][1].text_content().strip(),
                                         sponsors[0][0].text_content().strip(),
                                         sponsor_link=sponsors[0][0].xpath('string(a/@href)'))

            elif action == 'COMMITTEES':
                # the html for this table has meta tags that give the chamber
                # and the committee abreviation
                # <meta name="HCOMMITTEE" content="RULES">
                # question for actions: in the case of committees would House
                # Rules be better for an actor?
                for row in rows[1:]:
                    tds = row.cssselect('td')
                    meta_tag = row.cssselect('meta')[0]
                    actor = "%s:%s" % (meta_tag.get('name'),
                                       meta_tag.get('content'))
                    committee = meta_tag.get('content')
                    # NOTE(review): 'reffered' typo in the stored action
                    # text (the type= kwarg is spelled correctly); fixing
                    # it would change saved data, so left as-is.
                    act = 'committee:reffered'
                    date = datetime.datetime.strptime(tds[1].text_content().strip(), '%m/%d/%y')
                    bill.add_action(actor, act, date, type='committee:referred')

                    if len(tds) == 5:
                        # Row with an outcome but no vote tally.
                        if re.match('\d{2}/\d{2}/\d{2}', tds[3].text_content().strip()):
                            date = datetime.datetime.strptime(tds[3].text_content().strip(), '%m/%d/%y')
                        else:
                            date = datetime.datetime.strptime(tds[1].text_content().strip(), '%m/%d/%y')
                        act = tds[4].text_content().strip()
                        status = 'other'
                        bill.add_action(actor, act, date, type=status,
                                        status=status)
                    elif len(tds) == 6:
                        # Row with a committee vote, e.g. "(5-1-0-0)".
                        where, committee = actor.split(':')
                        where = 'lower' if where == 'HCOMMITTEE' else 'upper'
                        date = datetime.datetime.strptime(tds[3].text_content().strip(), '%m/%d/%y')
                        vote = tds[4].text_content().strip()[1:-1]
                        if len(vote.split('-')) == 4:
                            yes, no, nv, exc = vote.split('-')
                        else:
                            yes, no, excused, absent, nv = vote.split('-')
                        motion = tds[5].text_content().strip()
                        # NOTE(review): yes/no are still strings here, so
                        # this is a lexicographic compare — e.g. '9' > '10'.
                        passed = True if yes > no else False
                        vote = Vote(where, date, motion, passed, int(yes),
                                    int(no), int(nv), committee=committee)
                        vote.add_source(tds[0].xpath('string(a/@href)').strip())
                        bill.add_vote(vote)

            elif action in ('HOUSE FIRST READ', 'HOUSE SECOND READ'):
                aType = 'other'
                if re.search('HOUSE FIRST', action):
                    aType = 'committee:referred'
                bill.add_action('lower', action, utils.get_date(rows[0][1]),
                                type=aType)

            elif action in ('SENATE FIRST READ', 'SENATE SECOND READ'):
                aType = 'other'
                if re.search('SECOND', action):
                    aType = 'committee:referred'
                bill.add_action('upper', action, utils.get_date(rows[0][1]),
                                type=aType)

            elif action in ('TRANSMIT TO HOUSE', 'TRANSMIT TO SENATE'):
                actor = 'lower' if re.match('HOUSE', action) else 'upper'
                house = True if actor == 'lower' else False
                date = utils.get_date(rows[0][1])
                bill.add_action(actor, action, date)

            elif re.match('COW ACTION \d', action):
                # Committee of the Whole actions.
                actor = 'lower' if house else 'upper'
                for row in rows[1:]:
                    date = utils.get_date(row[1])
                    bill.add_action(actor, action, date,
                                    motion=row[2].text_content().strip())

            elif action in ('HOUSE FINAL READ', 'SENATE FINAL READ',
                            'THIRD READ'):
                actor = 'lower' if house else 'upper'
                for row in rows[1:]:
                    if row[0].text_content().strip() == 'Vote Detail':
                        # 10-column rows lack the "amended" column that
                        # 11-column rows carry; otherwise identical.
                        if len(row.getchildren()) == 10:
                            detail, date, ayes, nays, nv, exc, emer, rfe, two_thirds, result = [
                                x.text_content().strip() for x in row
                            ]
                            # NOTE(review): leftover debug output (also
                            # pins this file to Python 2).
                            print action_url
                            passed = True if result == 'PASSED' else False
                            motion = action
                            date = datetime.datetime.strptime(date, '%m/%d/%y') if date else ''
                            vote = Vote(actor, date, motion, passed,
                                        int(ayes), int(nays), int(nv),
                                        excused=int(exc), emergency=emer,
                                        rfe=rfe, two_thirds_vote=two_thirds,
                                        type="passage")
                            vote.add_source(row[0].xpath('string(a/@href)').strip())
                            bill.add_vote(vote)
                        elif len(row.getchildren()) == 11:
                            detail, date, ayes, nays, nv, exc, emer, amend, rfe, two_thirds, result = [
                                x.text_content().strip() for x in row
                            ]
                            passed = True if result == 'PASSED' else False
                            motion = action
                            date = datetime.datetime.strptime(date, '%m/%d/%y') if date else ''
                            vote = Vote(actor, date, motion, passed,
                                        int(ayes), int(nays), int(nv),
                                        excused=int(exc), emergency=emer,
                                        amended=amend, rfe=rfe,
                                        two_thirds_vote=two_thirds,
                                        type="passage")
                            vote.add_source(row[0].xpath('string(a/@href)').strip())
                            bill.add_vote(vote)

            elif action == 'TRANSMITTED TO':
                # Transmission to the governor, then his action on the bill.
                actor = 'lower' if house else 'upper'
                act = action + ": " + rows[0][1].text_content().strip()
                date = rows[0][2].text_content().strip()
                date = datetime.datetime.strptime(date, '%m/%d/%y')
                bill.add_action(actor, act, date, type='governor:received')

                # need action and chaptered, chaptered version if they exists
                act, date, chapter, version = '', '', '', ''
                for row in rows[1:]:
                    if row[0].text_content().strip() == 'ACTION:':
                        act = row[1].text_content().strip()
                        date = datetime.datetime.strptime(row[2].text_content().strip(), '%m/%d/%y')
                    elif row[0].text_content().strip() == 'CHAPTER':
                        chapter = row[1].text_content().strip()
                    elif row[0].text_content().strip() == 'CHAPTERED VERSION':
                        # NOTE(review): text_content is not called here
                        # (missing parentheses) — .strip() on the bound
                        # method would raise AttributeError if reached.
                        version = row[1].text_content.strip()
                if act:
                    action_type = 'governor:signed' if act == 'SIGNED' else 'governor:vetoed'
                    if chapter:
                        bill.add_action('governor', act, date,
                                        type=action_type, chapter=chapter,
                                        chaptered_version=version)
                    else:
                        bill.add_action('governor', act, date,
                                        type=action_type)

    self.save_bill(bill)
    self.log("saved: " + bill['bill_id'])
def scrape_bills(self, session):
    """Scrape every Mississippi measure in a session (both chambers).

    Reads the session's all-measures index XML, then for each measure
    fetches its detail XML and builds a Bill with sponsors, versions,
    actions and linked votes, saving each via self.save_bill().
    """
    url = 'http://billstatus.ls.state.ms.us/%s/pdf/all_measures/allmsrs.xml' % session

    with self.urlopen(url) as bill_dir_page:
        root = lxml.etree.fromstring(bill_dir_page, lxml.etree.HTMLParser())
        for mr in root.xpath('//lastaction/msrgroup'):
            bill_id = mr.xpath('string(measure)').replace(" ", "")
            # Senate measure ids start with "S" (SB, SC, ...); House otherwise.
            if bill_id[0] == "S":
                chamber = "upper"
            else:
                chamber = "lower"

            link = mr.xpath('string(actionlink)').replace("..", "")
            main_doc = mr.xpath('string(measurelink)').replace("../../../", "")
            main_doc_url = 'http://billstatus.ls.state.ms.us/%s' % main_doc
            bill_details_url = 'http://billstatus.ls.state.ms.us/%s/pdf/%s' % (session, link)

            with self.urlopen(bill_details_url) as details_page:
                # Detail XML is latin-1; re-encode so lxml parses cleanly.
                details_page = details_page.decode('latin1').encode('utf8', 'ignore')
                details_root = lxml.etree.fromstring(details_page,
                                                     lxml.etree.HTMLParser())
                title = details_root.xpath('string(//shorttitle)')
                longtitle = details_root.xpath('string(//longtitle)')
                bill = Bill(session, chamber, bill_id, title,
                            longtitle=longtitle)

                # Sponsors.  Guard against an empty <p_name>, which used to
                # raise IndexError on .split()[0].
                # NOTE(review): sponsor URLs always use House_authors/ even
                # for Senate measures — confirm against the site.
                main_sponsor = details_root.xpath('string(//p_name)').split()
                if main_sponsor:
                    main_sponsor = main_sponsor[0]
                    main_sponsor_link = details_root.xpath('string(//p_link)').replace(" ", "_")
                    main_sponsor_url = 'http://billstatus.ls.state.ms.us/%s/pdf/House_authors/%s.xml' % (session, main_sponsor_link)
                    bill.add_sponsor("Primary sponsor", main_sponsor,
                                     main_sponsor_url=main_sponsor_url)
                for author in details_root.xpath('//authors/additional'):
                    leg = author.xpath('string(co_name)').replace(" ", "_")
                    leg_url = 'http://billstatus.ls.state.ms.us/%s/pdf/House_authors/%s.xml' % (session, leg)
                    bill.add_sponsor("additional sponsor", leg,
                                     leg_url=leg_url)

                # Versions
                curr_version = details_root.xpath('string(//current_other)').replace("../../../../", "")
                curr_version_url = "http://billstatus.ls.state.ms.us/" + curr_version
                bill.add_version("Current version", curr_version_url)

                intro_version = details_root.xpath('string(//intro_other)').replace("../../../../", "")
                intro_version_url = "http://billstatus.ls.state.ms.us/" + intro_version
                bill.add_version("As Introduced", intro_version_url)

                # Optional versions: the element holds a real path (it
                # contains "documents") only when that version exists.
                comm_version = details_root.xpath('string(//cmtesub_other)').replace("../../../../", "")
                if comm_version.find("documents") != -1:
                    comm_version_url = "http://billstatus.ls.state.ms.us/" + comm_version
                    bill.add_version("Committee Substitute", comm_version_url)

                passed_version = details_root.xpath('string(//passed_other)').replace("../../../../", "")
                if passed_version.find("documents") != -1:
                    passed_version_url = "http://billstatus.ls.state.ms.us/" + passed_version
                    title = "As Passed the " + chamber
                    bill.add_version(title, passed_version_url)

                asg_version = details_root.xpath('string(//asg_other)').replace("../../../../", "")
                if asg_version.find("documents") != -1:
                    asg_version_url = "http://billstatus.ls.state.ms.us/" + asg_version
                    bill.add_version("Approved by the Governor", asg_version_url)

                # Actions
                for action in details_root.xpath('//history/action'):
                    action_num = action.xpath('string(act_number)').strip()
                    action_num = int(action_num)
                    action_desc = action.xpath('string(act_desc)')
                    act_vote = action.xpath('string(act_vote)').replace("../../../..", "")
                    # First token is "mm/dd"; append the session's year.
                    date = action_desc.split()[0] + "/" + session[0:4]
                    date = datetime.strptime(date, "%m/%d/%Y")

                    # "(H)"/"(S)" marks the acting chamber; descriptions
                    # too short to carry a marker are executive actions.
                    # (Was a bare except; only IndexError is expected, and
                    # "executive" is lowercased for consistency with the
                    # other scrapers' actor values.)
                    try:
                        actor = action_desc.split()[2][1]
                        if actor == "H":
                            actor = "lower"
                        else:
                            actor = "upper"
                    except IndexError:
                        actor = "executive"
                    action = action_desc[10:len(action_desc)]

                    # Veto actions carry a link to the veto message document.
                    if action.find("Veto") != -1:
                        version_path = details_root.xpath("string(//veto_other)")
                        version_path = version_path.replace("../../../../", "")
                        version_url = "http://billstatus.ls.state.ms.us/" + version_path
                        bill.add_document("Veto", version_url)

                    bill.add_action(actor, action, date,
                                    action_num=action_num)

                    # An empty act_vote leaves vote_url equal to the bare
                    # host, meaning no roll call is linked.
                    vote_url = 'http://billstatus.ls.state.ms.us%s' % act_vote
                    if vote_url != "http://billstatus.ls.state.ms.us":
                        vote = self.scrape_votes(vote_url, action, date, actor)
                        bill.add_vote(vote)

                self.save_bill(bill)