def scrape(self, chamber, session):
    year = year_from_session(session)
    url = bills_url(year)
    with self.urlopen(url) as bills_page_html:
        bills_page = lxml.html.fromstring(bills_page_html)
        table_rows = bills_page.cssselect('tr')
        # Eliminate empty rows
        table_rows = table_rows[0:len(table_rows):2]

        for row in table_rows:
            row_elements = row.cssselect('td')

            bill_document = row_elements[0]
            bill_document.make_links_absolute(base_url())
            element, attribute, link, pos = bill_document.iterlinks().next()
            bill_id = element.text_content().rstrip('.pdf')
            bill_document_link = link

            title_and_sponsors = row_elements[1]
            title_match = re.search('([A-Z][a-z]+.+[a-z])[A-Z]',
                                    title_and_sponsors.text_content())
            sponsors_match = re.search('[a-z]([A-Z]+.+)',
                                       title_and_sponsors.text_content())
            title = title_match.group(1)
            sponsors = sponsors_match.group(1)
            separated_sponsors = sponsors.split('--')

            bill = Bill(session, chamber, bill_id, title)
            bill.add_version('current', bill_document_link)

            if separated_sponsors[1] == '(NONE)':
                bill.add_sponsor('primary', separated_sponsors[0])
            else:
                bill.add_sponsor('cosponsor', separated_sponsors[0])
                bill.add_sponsor('cosponsor', separated_sponsors[1])

            versions_page_element = row_elements[2]
            versions_page_element.make_links_absolute(base_url())
            element, attribute, link, pos = versions_page_element.iterlinks().next()
            bill.add_source(link)
            self.scrape_versions(link, bill)

            actions_page_element = row_elements[3]
            element, attribute, link, pos = actions_page_element.iterlinks().next()
            frame_link = base_url() + link.split('?Open&target=')[1]
            self.scrape_actions(frame_link, bill)

            votes_page_element = row_elements[7]
            element, attribute, link, pos = votes_page_element.iterlinks().next()
            frame_link = base_url() + link.split('?Open&target=')[1]
            self.scrape_votes(frame_link, chamber, bill)
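# Note: scrape() above relies on module-level helpers that are not shown in
# this excerpt: year_from_session(), bills_url() and base_url(). Below is a
# minimal sketch of what they might look like, inferred from how they are
# called here and from the hard-coded URLs in the older Colorado scraper
# further down; the real implementations may differ.

def base_url():
    # Site root that relative links are joined to (see the older scraper below).
    return 'http://www.leg.state.co.us'

def year_from_session(session):
    # Hypothetical: assumes the session string starts with the four-digit year,
    # e.g. '2010A' -> '2010'.
    return session[:4]

def bills_url(year):
    # Bill index view for a given year, matching the URL built by hand in the
    # older scraper below. The %% escapes produce a literal %28 / %29 pair.
    return ('http://www.leg.state.co.us/CLICS/CLICS%sA/csl.nsf/'
            '%%28bf-1%%29?OpenView&Count=2000' % year)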
def scrape2003(self, url, year, chamberName, session, number):
    "e.g. http://www.legis.ga.gov/legis/2003_04/sum/sum/sb1.htm"
    with self.lxml_context(url) as page:
        # Grab the interesting tables on the page.
        tables = page.cssselect('center table')

        # Bill
        name = tables[0].text_content().split('-', 1)[1]
        bill = Bill(session, chamberName, number, name)

        # Sponsorships
        for a in tables[1].cssselect('a'):
            bill.add_sponsor('', a.text_content().strip())

        # Actions
        center = page.cssselect('center table center')[0]
        for row in center.cssselect('table')[-2].cssselect('tr')[2:]:
            date = row[0].text_content().strip()
            action_text = row[1].text_content().strip()
            if '/' not in date:
                continue
            if action_text.startswith('Senate'):
                bill.add_action('upper', action_text, date)
            elif action_text.startswith('House'):
                bill.add_action('lower', action_text, date)

        # Versions
        for a in center.cssselect('table')[-1].cssselect('a'):
            bill.add_version(a.text_content(),
                             urlparse.urljoin(url, a.get('href')))

        self.save_bill(bill)
def parse_senate_billpage(self, bill_url, year):
    with self.urlopen(bill_url) as bill_page:
        bill_page = BeautifulSoup(bill_page)

        # get all the info needed to record the bill
        bill_id = bill_page.find(id="lblBillNum").b.font.contents[0]
        bill_title = bill_page.find(id="lblBillTitle").font.string
        bill_desc = bill_page.find(id="lblBriefDesc").font.contents[0]
        bill_lr = bill_page.find(id="lblLRNum").font.string

        bill = Bill(year, 'upper', bill_id, bill_desc, bill_url=bill_url,
                    bill_lr=bill_lr, official_title=bill_title)
        bill.add_source(bill_url)

        # Get the primary sponsor
        bill_sponsor = bill_page.find(id="hlSponsor").i.font.contents[0]
        bill_sponsor_link = bill_page.find(id="hlSponsor").href
        bill.add_sponsor('primary', bill_sponsor,
                         sponsor_link=bill_sponsor_link)

        # cosponsors show up on their own page, if they exist
        cosponsor_tag = bill_page.find(id="hlCoSponsors")
        if cosponsor_tag and 'href' in cosponsor_tag:
            self.parse_senate_cosponsors(bill, cosponsor_tag['href'])

        # get the actions
        action_url = bill_page.find(id="hlAllActions")['href']
        self.parse_senate_actions(bill, action_url)

        # stored on a separate page
        versions_url = bill_page.find(id="hlFullBillText")
        if versions_url:
            self.parse_senate_bill_versions(bill, versions_url['href'])

        self.save_bill(bill)
def scrape2009(self, url, year, chamberName, session, number):
    "e.g. http://www.legis.ga.gov/legis/2009_10/sum/sum/sb1.htm"
    with self.lxml_context(url) as page:
        # Bill
        name = page.cssselect('#legislation h1')[0].text_content().strip()
        bill_id = name.split(' - ')[0].strip()
        bill = Bill(session, chamberName, bill_id, name)

        # Sponsorships
        for a in page.cssselect("#sponsors a"):
            bill.add_sponsor('', a.text_content().strip())

        # Actions
        for row in page.cssselect('#history tr')[1:]:
            date = row[0].text_content().strip()
            action_text = row[1].text_content().strip()
            if '/' not in date:
                continue
            date = datetime.datetime.strptime(date, '%m/%d/%Y')
            if action_text.startswith('Senate'):
                bill.add_action('upper', action_text, date)
            elif action_text.startswith('House'):
                bill.add_action('lower', action_text, date)

        # Versions
        for a in page.cssselect('#versions a'):
            bill.add_version(a.text_content(),
                             urlparse.urljoin(url, a.get('href')))

        self.save_bill(bill)
def scrape(self, chamber, session):
    self.validate_session(session)

    if chamber == 'upper':
        other_chamber = 'lower'
        bill_id = 'SB 1'
    else:
        other_chamber = 'upper'
        bill_id = 'HB 1'

    b1 = Bill(session, chamber, bill_id, 'A super bill')
    b1.add_source('http://example.com/')
    b1.add_version('As Introduced', 'http://example.com/SB1.html')
    b1.add_document('Google', 'http://google.com')
    b1.add_sponsor('primary', 'Bob Smith')
    b1.add_sponsor('secondary', 'Johnson, Sally')

    d1 = datetime.datetime.strptime('1/29/2010', '%m/%d/%Y')
    v1 = Vote('upper', d1, 'Final passage', True, 2, 0, 0)
    v1.yes('Smith')
    v1.yes('Johnson')

    d2 = datetime.datetime.strptime('1/30/2010', '%m/%d/%Y')
    v2 = Vote('lower', d2, 'Final passage', False, 0, 1, 1)
    v2.no('Bob Smith')
    v2.other('S. Johnson')

    b1.add_vote(v1)
    b1.add_vote(v2)

    b1.add_action(chamber, 'introduced', d1)
    b1.add_action(chamber, 'read first time', d2)
    b1.add_action(other_chamber, 'introduced', d2)

    self.save_bill(b1)
def scrape1999(self, url, year, chamberName, session, number):
    "e.g. http://www.legis.ga.gov/legis/1999_00/leg/sum/sb1.htm"
    with self.lxml_context(url) as page:
        # Grab the interesting tables on the page.
        tables = page.cssselect('table')

        # Bill
        name = tables[1].cssselect('a')[0].text_content().split('-', 1)[1]
        bill = Bill(session, chamberName, number, name)

        # Versions
        bill.add_version('Current', url.replace('/sum/', '/fulltext/'))

        # Sponsorships
        for a in tables[2].cssselect('a'):
            bill.add_sponsor('', a.text_content().strip())

        # Actions
        for row in tables[-1].cssselect('tr'):
            senate_date = row[0].text_content().strip()
            action_text = row[1].text_content().strip()
            house_date = row[2].text_content().strip()
            if '/' not in senate_date and '/' not in house_date:
                continue
            if senate_date:
                bill.add_action('upper', action_text, senate_date)
            if house_date:
                bill.add_action('lower', action_text, house_date)

        self.save_bill(bill)
def scrape_bill(self, chamber, session, bill_id, url):
    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        header = page.xpath('//h3/br')[0].tail.replace(u'\xa0', ' ')
        title, primary_sponsor = header.split(' -- ')

        if bill_id.startswith('H.B.') or bill_id.startswith('S.B.'):
            bill_type = ['bill']
        elif bill_id.startswith('H.C.R.') or bill_id.startswith('S.C.R.'):
            bill_type = ['concurrent resolution']
        elif bill_id.startswith('H.J.R.') or bill_id.startswith('S.J.R.'):
            bill_type = ['joint resolution']

        bill = Bill(session, chamber, bill_id, title, type=bill_type)
        bill.add_sponsor('primary', primary_sponsor)
        bill.add_source(url)

        status_link = page.xpath('//a[contains(@href, "billsta")]')[0]
        self.parse_status(bill, status_link.attrib['href'])

        for link in page.xpath(
                '//a[contains(@href, "bills/") and text() = "HTML"]'):
            name = link.getprevious().tail.strip()
            bill.add_version(name, link.attrib['href'])

        self.save_bill(bill)
def scrape_bill(self, chamber, session, url):
    page = self.urlopen(url)
    root = lxml.html.fromstring(page)

    bill_detail_el = root.xpath('//div[@class="col2"]//div[@class="Columns bg2717"]//div[@class="widgetContent"]')[0]

    title = bill_detail_el.xpath('.//p/text()')[0]
    bill_id = bill_detail_el.xpath('./p/b/text()')[1].strip()
    m = re.search('Bill Number: ([HSD0-9]+)', bill_id)
    if len(m.groups()):
        bill_id = m.groups()[0]
    else:
        bill_id = None

    doctype = None
    if self._last_doctype:
        doctype = self._last_doctype.lower()

    bill = Bill(session, chamber, bill_id, title, type=doctype)

    sponsors_el = bill_detail_el.xpath('./p[2]/a/text()')
    for i in range(len(sponsors_el)):
        sponsor = sponsors_el[i]
        if i == 0:
            type = 'primary'
        else:
            type = 'cosponsor'
        bill.add_sponsor(type, sponsor)

    secondary_sponsors_el = bill_detail_el.xpath('.//div[@class="dataBlock"]//td/a/text()')
    for secondary_sponsor in secondary_sponsors_el:
        bill.add_sponsor('secondary', secondary_sponsor)

    print bill
    self.save_bill(bill)
def scrape(self, chamber, year):
    # Data prior to 1997 is contained in pdfs
    if year < "1997":
        raise NoDataForYear(year)

    bills_url = "http://www.leg.state.co.us/CLICS/CLICS" + year + "A/csl.nsf/%28bf-1%29?OpenView&Count=2000"
    with self.lxml_context(bills_url) as bills_page:
        table_rows = bills_page.cssselect("tr")
        # Eliminate empty rows
        table_rows = table_rows[0:len(table_rows):2]

        for row in table_rows:
            row_elements = row.cssselect("td")

            bill_document = row_elements[0]
            bill_document.make_links_absolute("http://www.leg.state.co.us")
            element, attribute, link, pos = bill_document.iterlinks().next()
            bill_id = element.text_content().rstrip(".pdf")
            bill_document_link = link

            title_and_sponsors = row_elements[1]
            title_match = re.search("([A-Z][a-z]+.+[a-z])[A-Z]",
                                    title_and_sponsors.text_content())
            sponsors_match = re.search("[a-z]([A-Z]+.+)",
                                       title_and_sponsors.text_content())
            title = title_match.group(1)
            sponsors = sponsors_match.group(1)
            separated_sponsors = sponsors.split("--")

            bill = Bill(year, chamber, bill_id, title)
            bill.add_version("current", bill_document_link)

            if separated_sponsors[1] == "(NONE)":
                bill.add_sponsor("primary", separated_sponsors[0])
            else:
                bill.add_sponsor("cosponsor", separated_sponsors[0])
                bill.add_sponsor("cosponsor", separated_sponsors[1])

            versions_page_element = row_elements[2]
            versions_page_element.make_links_absolute("http://www.leg.state.co.us")
            element, attribute, link, pos = versions_page_element.iterlinks().next()
            bill.add_source(link)
            self.scrape_versions(link, bill)

            actions_page_element = row_elements[3]
            element, attribute, link, pos = actions_page_element.iterlinks().next()
            frame_link = "http://www.leg.state.co.us" + link.split("?Open&target=")[1]
            self.scrape_actions(frame_link, bill)

            votes_page_element = row_elements[7]
            element, attribute, link, pos = votes_page_element.iterlinks().next()
            frame_link = "http://www.leg.state.co.us" + link.split("?Open&target=")[1]
            self.scrape_votes(frame_link, chamber, bill)
def scrape_year(self, year, chamber):
    sep = '<h1>House</h1>'
    if chamber == 'upper':
        after = False
        reg = '[5-9]'
    else:
        after = True
        reg = '[1-4]'

    with self.lxml_context("http://apps.leg.wa.gov/billinfo/dailystatus.aspx?year=" + str(year), sep, after) as page:
        for element, attribute, link, pos in page.iterlinks():
            if re.search("bill=" + reg + "[0-9]{3}", link) != None:
                bill_page_url = "http://apps.leg.wa.gov/billinfo/" + link
                with self.lxml_context(bill_page_url) as bill_page:
                    raw_title = bill_page.cssselect('title')
                    split_title = string.split(raw_title[0].text_content(), ' ')
                    bill_id = split_title[0] + ' ' + split_title[1]
                    bill_id = bill_id.strip()
                    session = split_title[3].strip()

                    title_element = bill_page.get_element_by_id("ctl00_ContentPlaceHolder1_lblSubTitle")
                    title = title_element.text_content()

                    bill = Bill(session, chamber, bill_id, title)
                    bill.add_source(bill_page_url)

                    self.scrape_actions(bill_page, bill)

                    for element, attribute, link, pos in bill_page.iterlinks():
                        if re.search("billdocs", link) != None:
                            if re.search("Amendments", link) != None:
                                bill.add_document("Amendment: " + element.text_content(), link)
                            elif re.search("Bills", link) != None:
                                bill.add_version(element.text_content(), link)
                            else:
                                bill.add_document(element.text_content(), link)
                        elif re.search("senators|representatives", link) != None:
                            with self.lxml_context(link) as senator_page:
                                try:
                                    name_tuple = self.scrape_legislator_name(senator_page)
                                    bill.add_sponsor('primary', name_tuple[0])
                                except:
                                    pass
                        elif re.search("ShowRollCall", link) != None:
                            match = re.search("([0-9]+,[0-9]+)", link)
                            match = match.group(0)
                            match = match.split(',')
                            id1 = match[0]
                            id2 = match[1]
                            url = "http://flooractivityext.leg.wa.gov/rollcall.aspx?id=" + id1 + "&bienId=" + id2
                            with self.lxml_context(url) as vote_page:
                                self.scrape_votes(vote_page, bill, url)

                    self.save_bill(bill)
def scrape(self, chamber, session):
    self.validate_session(session)

    if chamber == "lower":
        bill_abbr = "HB"
    else:
        bill_abbr = "SB"

    bill_list_url = "http://www.le.state.ut.us/~%s/bills.htm" % (
        session.replace(' ', ''))
    self.log("Getting bill list for %s, %s" % (session, chamber))

    try:
        base_bill_list = self.soup_parser(self.urlopen(bill_list_url))
    except:
        # this session doesn't exist for this year
        return

    bill_list_link_re = re.compile('.*%s\d+ht.htm$' % bill_abbr)

    for link in base_bill_list.findAll('a', href=bill_list_link_re):
        bill_list = self.soup_parser(self.urlopen(link['href']))
        bill_link_re = re.compile('.*billhtm/%s.*.htm' % bill_abbr)

        for bill_link in bill_list.findAll('a', href=bill_link_re):
            bill_id = bill_link.find(text=True).strip()

            bill_info_url = bill_link['href']
            bill_info = self.soup_parser(self.urlopen(bill_info_url))

            bill_title, primary_sponsor = bill_info.h3.contents[2].replace(
                '&nbsp;', ' ').strip().split(' -- ')

            bill = Bill(session, chamber, bill_id, bill_title)
            bill.add_source(bill_info_url)
            bill.add_sponsor('primary', primary_sponsor)

            status_re = re.compile('.*billsta/%s.*.htm' % bill_abbr.lower())
            status_link = bill_info.find('a', href=status_re)
            if status_link:
                self.parse_status(bill, status_link['href'])

            text_find = bill_info.find(
                text="Bill Text (If you are having trouble viewing")
            if text_find:
                text_link_re = re.compile('.*\.htm')
                for text_link in text_find.parent.parent.findAll(
                        'a', href=text_link_re)[1:]:
                    version_name = text_link.previous.strip()
                    bill.add_version(version_name, text_link['href'])

            self.save_bill(bill)
def scrape_bill(self, chamber, session, billid, histurl, year):
    if year[0] != 'R':
        session = year
    else:
        session = self.metadata['session_details'][year][
            'sub_sessions'][int(year[0]) - 1]

    with self.urlopen(histurl) as data:
        soup = BeautifulSoup(cleansource(data))
        basicinfo = soup.findAll('div', id='bhistleft')[0]
        hist = basicinfo.table

        sponsor = None
        title = None
        for b in basicinfo.findAll('b'):
            if b.next.startswith('SUMMARY'):
                title = b.findNextSiblings(text=True)[0].strip()
            elif b.next.startswith('SPONSOR'):
                for a in b.findNextSiblings('a'):
                    if not issponsorlink(a):
                        break
                    sponsor = cleansponsor(a.contents[0])

        bill = Bill(session, chamber, billid, title)

        if sponsor:
            bill.add_sponsor('primary', sponsor)

        for row in hist.findAll('tr'):
            link = row.td.a
            vlink = urlbase % link['href']
            vname = link.contents[0].strip()
            bill.add_version(vname, vlink)

        history = soup.findAll('div', id='bhisttab')[0].table
        rows = history.findAll('tr')[1:]
        for row in rows:
            tds = row.findAll('td')
            if len(tds) < 2:
                # This is not actually an action
                continue

            date, action = row.findAll('td')[:2]
            date = dt.datetime.strptime(date.contents[0], '%m/%d/%y')
            action = action.contents[0].strip()

            if 'House' in action:
                actor = 'lower'
            elif 'Senate' in action:
                actor = 'upper'
            else:
                # for lack of a better option, default to the bill's chamber
                actor = chamber

            bill.add_action(actor, action, date)

        self.save_bill(bill)
def parse_bill(self, chamber, session, bill_id, bill_info_url):
    with self.urlopen(bill_info_url) as bill_info_data:
        bill_info = self.soup_parser(bill_info_data)

        version_url = '%s/bill.doc' % bill_id
        version_link = bill_info.find(href=version_url)
        if not version_link:
            # This bill was withdrawn
            return

        bill_title = version_link.findNext('p').contents[0].strip()

        bill = Bill(session, chamber, bill_id, bill_title)
        bill.add_version("Most Recent Version",
                         session_url(session) + version_url)
        bill.add_source(bill_info_url)

        sponsor_links = bill_info.findAll(href=re.compile(
            'legislator/[SH]\d+\.htm'))
        for sponsor_link in sponsor_links:
            bill.add_sponsor('primary', sponsor_link.contents[0].strip())

        action_p = version_link.findAllNext('p')[-1]
        for action in action_p.findAll(text=True):
            action = action.strip()
            if (not action or action == 'last action' or
                    'Prefiled' in action):
                continue

            action_date = action.split('-')[0]
            action_date = dt.datetime.strptime(action_date, '%b %d')
            # Fix: the page omits the year, so fill it in from the session
            action_date = action_date.replace(
                year=int('20' + session[2:4]))

            action = '-'.join(action.split('-')[1:])

            if action.endswith('House') or action.endswith('(H)'):
                actor = 'lower'
            elif action.endswith('Senate') or action.endswith('(S)'):
                actor = 'upper'
            else:
                actor = chamber

            bill.add_action(actor, action, action_date)

        vote_link = bill_info.find(href=re.compile('.*/vote_history.pdf'))
        if vote_link:
            bill.add_document(
                'vote_history.pdf',
                bill_info_url.replace('.htm', '') + "/vote_history.pdf")

        self.save_bill(bill)
def scrape(self, chamber, session):
    self.site_id = self.metadata['session_details'][session]['internal_id']
    chamber_piece = {'upper': 'Senate',
                     'lower': 'House+of+Representatives'}[chamber]

    # resolutions
    # http://alisondb.legislature.state.al.us/acas/SESSResosBySelectedMatterTransResults.asp?WhichResos=Senate&TransCodes={All}&LegDay={All}%22&GetBillsTrans=Get+Resolutions+by+Transaction

    url = 'http://alisondb.legislature.state.al.us/acas/SESSBillsBySelectedMatterTransResults.asp?TransCodes={All}&LegDay={All}&WhichBills=%s' % chamber_piece

    self.refresh_session()

    with self.urlopen(url) as html:
        doc = lxml.html.fromstring(html)

        # bills are all their own table with cellspacing=4 (skip first)
        bill_tables = doc.xpath('//table[@cellspacing="4"]')
        for bt in bill_tables[1:]:

            # each table has 3 rows: detail row, description, blank
            details, desc, _ = bt.xpath('tr')

            # first <tr> has img, button, sponsor, topic, current house,
            # current status, committee, committee2, last action
            _, button, sponsor, topic, _, _, com1, com2, _ = details.xpath('td')

            # pull bill_id out of script tag (gross)
            bill_id = bill_id_re.search(button.text_content()).group()
            oid = btn_re.search(button.text_content()).groups()[0]

            sponsor = sponsor.text_content()
            topic = topic.text_content()
            com1 = com1.text_content()
            com2 = com2.text_content()
            desc = desc.text_content()

            # create bill
            bill = Bill(session, chamber, bill_id, desc.strip(), topic=topic)
            bill.add_sponsor('primary', sponsor)

            self.get_sponsors(bill, oid)
            self.get_actions(bill, oid)

            # craft bill URL
            session_fragment = '2010rs'
            type_fragment = 'bills'
            bill_id_fragment = bill_id.lower()
            bill_text_url = 'http://alisondb.legislature.state.al.us/acas/searchableinstruments/%s/%s/%s.htm' % (
                session_fragment, type_fragment, bill_id_fragment)
            bill.add_version('bill text', bill_text_url)

            self.save_bill(bill)
def get_bill_info(self, chamber, session, bill_detail_url, version_list_url):
    """Extracts all the requested info for a given bill.

    Calls the parent's methods to enter the results into JSON files.
    """
    if chamber == "House":
        chamber = 'lower'
    else:
        chamber = 'upper'

    with self.urlopen(bill_detail_url) as bill_html:
        bill_soup = BeautifulSoup(bill_html)

        bill_id = self.extract_bill_id(bill_soup)
        bill_title = self.extract_bill_title(bill_soup)
        bill = Bill(session, chamber, bill_id, bill_title)

    # Get all versions of the bill.
    # Versions of a bill are on a separate page, linked to from the column
    # labeled, "Bill Text", on the search results page.
    with self.urlopen(version_list_url) as version_html:
        version_soup = BeautifulSoup(version_html)

        # MN bills can have multiple versions. Get them all, and loop over
        # the results, adding each one.
        self.debug("Extracting bill versions from: " + version_list_url)
        bill_versions = self.extract_bill_versions(version_soup)
        for version in bill_versions:
            version_name = version['name']
            version_url = urlparse.urljoin(VERSION_URL_BASE, version['url'])
            bill.add_version(version_name, version_url)

    # grab primary and cosponsors
    # MN uses "Primary Author" to name a bill's primary sponsor.
    # Everyone else listed will be added as a 'cosponsor'.
    sponsors = self.extract_bill_sponsors(bill_soup)
    primary_sponsor = sponsors[0]
    cosponsors = sponsors[1:]
    bill.add_sponsor('primary', primary_sponsor)
    for leg in cosponsors:
        bill.add_sponsor('cosponsor', leg)

    # Add Actions performed on the bill.
    bill_actions = self.extract_bill_actions(bill_soup, chamber)
    for action in bill_actions:
        action_chamber = action['action_chamber']
        action_date = action['action_date']
        action_text = action['action_text']
        bill.add_action(action_chamber, action_text, action_date)

    self.save_bill(bill)
def scrape1995(self, url, year, chamberName, session, number):
    "e.g. http://www.legis.ga.gov/legis/1995_96/leg/sum/sb1.htm"
    with self.lxml_context(url) as page:
        # Bill
        name = page.cssselect('h3 br')[0].tail.split('-', 1)[1].strip()
        bill = Bill(session, chamberName, number, name)

        # Versions
        bill.add_version('Current', url.replace('/sum/', '/fulltext/'))

        # Sponsorships
        rows = page.cssselect('center table tr')
        for row in rows:
            if row.text_content().strip() == 'Sponsor and CoSponsors':
                continue
            if row.text_content().strip() == 'Links / Committees / Status':
                break
            for a in row.cssselect('a'):
                bill.add_sponsor('', a.text_content().strip())

        # Actions
        # The actions are in a pre table that looks like:
        """     SENATE                       HOUSE
                -------------------------------------
            1/13/95   Read 1st time          2/6/95
            1/31/95   Favorably Reported
            2/1/95    Read 2nd Time          2/7/95
            2/3/95    Read 3rd Time
            2/3/95    Passed/Adopted                  """
        actions = page.cssselect('pre')[0].text_content().split('\n')
        actions = actions[2:]
        for action in actions:
            senate_date = action[:22].strip()
            action_text = action[23:46].strip()
            house_date = action[46:].strip()
            if '/' not in senate_date and '/' not in house_date:
                continue
            if senate_date:
                bill.add_action('upper', action_text, senate_date)
            if house_date:
                bill.add_action('lower', action_text, house_date)

        self.save_bill(bill)
def get_bill_info(self, chamber, session, bill_detail_url, version_list_url):
    """Extracts all the requested info for a given bill.

    Calls the parent's methods to enter the results into JSON files.
    """
    if chamber == "House":
        chamber = 'lower'
    else:
        chamber = 'upper'

    with self.urlopen(bill_detail_url) as bill_html:
        doc = lxml.html.fromstring(bill_html)

        bill_id = doc.xpath('//title/text()')[0].split()[0]
        bill_title = doc.xpath('//font[@size=-1]/text()')[0]
        bill_type = {'F': 'bill', 'R': 'resolution',
                     'C': 'concurrent resolution'}[bill_id[1]]
        bill = Bill(session, chamber, bill_id, bill_title, type=bill_type)
        bill.add_source(bill_detail_url)

        # grab sponsors
        sponsors = doc.xpath('//table[@summary="Show Authors"]/descendant::a/text()')
        if sponsors:
            primary_sponsor = sponsors[0].strip()
            bill.add_sponsor('primary', primary_sponsor)
            cosponsors = sponsors[1:]
            for leg in cosponsors:
                bill.add_sponsor('cosponsor', leg.strip())

        # Add Actions performed on the bill.
        bill_actions = self.extract_bill_actions(doc, chamber)
        for action in bill_actions:
            bill.add_action(action['action_chamber'],
                            action['action_text'],
                            action['action_date'],
                            type=action['action_type'])

    # Get all versions of the bill.
    # Versions of a bill are on a separate page, linked to from the column
    # labeled, "Bill Text", on the search results page.
    with self.urlopen(version_list_url) as version_html:
        version_doc = lxml.html.fromstring(version_html)
        for v in version_doc.xpath('//a[starts-with(@href, "/bin/getbill.php")]'):
            version_url = urlparse.urljoin(VERSION_URL_BASE, v.get('href'))
            bill.add_version(v.text.strip(), version_url)

    self.save_bill(bill)
def scrape(self, chamber, session):
    self.validate_session(session)

    if chamber == 'upper':
        bill_no = 1
        abbr = 'SB'
    else:
        bill_no = 4001
        abbr = 'HB'

    while True:
        bill_page = self.scrape_bill(session, abbr, bill_no)
        # if we can't find a page, we must be done. This is a healthy thing.
        if bill_page == None:
            return
        bill_page = BeautifulSoup(bill_page)

        title = ''.join(self.flatten(bill_page.findAll(id='frg_billstatus_ObjectSubject')[0]))
        title = title.replace('\n', '').replace('\r', '')
        bill_id = "%s %d" % (abbr, bill_no)

        the_bill = Bill(session, chamber, bill_id, title)

        # sponsors
        first = 0
        for name in bill_page.findAll(id='frg_billstatus_SponsorList')[0].findAll('a'):
            the_bill.add_sponsor(['primary', 'cosponsor'][first], name.string)
            first = 1

        # versions
        for doc in bill_page.findAll(id='frg_billstatus_DocumentGridTable')[0].findAll('tr'):
            r = self.parse_doc(the_bill, doc)
            if r:
                the_bill.add_version(*r)

        # documents
        if 'frg_billstatus_HlaTable' in str(bill_page):
            for doc in bill_page.findAll(id='frg_billstatus_HlaTable')[0].findAll('tr'):
                r = self.parse_doc(the_bill, doc)
                if r:
                    the_bill.add_document(*r)
        if 'frg_billstatus_SfaSection' in str(bill_page):
            for doc in bill_page.findAll(id='frg_billstatus_SfaSection')[0].findAll('tr'):
                r = self.parse_doc(the_bill, doc)
                if r:
                    the_bill.add_document(*r)

        self.parse_actions(the_bill,
                           bill_page.findAll(id='frg_billstatus_HistoriesGridView')[0])
        self.save_bill(the_bill)

        bill_no = bill_no + 1
def scrape_assem_bills(self, chamber, insert, session, year):
    doc_type = {1: 'bill', 3: 'resolution', 5: 'concurrent resolution',
                6: 'joint resolution'}
    for docnum, bill_type in doc_type.iteritems():
        parentpage_url = 'http://www.leg.state.nv.us/Session/%s/Reports/HistListBills.cfm?DoctypeID=%s' % (insert, docnum)
        links = self.scrape_links(parentpage_url)
        count = 0
        for link in links:
            count = count + 1
            page_path = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (insert, link)
            with self.urlopen(page_path) as page:
                page = page.decode("utf8").replace(u"\xa0", " ")
                root = lxml.html.fromstring(page)

                bill_id = root.xpath('string(/html/body/div[@id="content"]/table[1]/tr[1]/td[1]/font)')
                title = root.xpath('string(/html/body/div[@id="content"]/table[1]/tr[5]/td)')

                if insert.find('Special') != -1:
                    session = insert

                bill = Bill(session, chamber, bill_id, title, type=bill_type)

                bill_text = root.xpath("string(/html/body/div[@id='content']/table[6]/tr/td[2]/a/@href)")
                text_url = "http://www.leg.state.nv.us" + bill_text
                bill.add_version("Bill Text", text_url)

                primary, secondary = self.scrape_sponsors(page)

                if primary[0] == 'By:':
                    primary.pop(0)

                    if primary[0] == 'ElectionsProceduresEthicsand':
                        primary[0] = 'Elections Procedures Ethics and'

                    full_name = ''
                    for part_name in primary:
                        full_name = full_name + part_name + " "
                    bill.add_sponsor('primary', full_name)
                else:
                    for leg in primary:
                        bill.add_sponsor('primary', leg)
                for leg in secondary:
                    bill.add_sponsor('cosponsor', leg)

                minutes_count = 2
                for mr in root.xpath('//table[4]/tr/td[3]/a'):
                    minutes = mr.xpath("string(@href)")
                    minutes_url = "http://www.leg.state.nv.us" + minutes
                    minutes_date_path = "string(//table[4]/tr[%s]/td[2])" % minutes_count
                    minutes_date = mr.xpath(minutes_date_path).split()
                    minutes_date = minutes_date[0] + minutes_date[1] + minutes_date[2] + " Minutes"
                    bill.add_document(minutes_date, minutes_url)
                    minutes_count = minutes_count + 1

                self.scrape_actions(root, bill, "lower")
                self.scrape_votes(page, bill, insert, year)
                bill.add_source(page_path)
                self.save_bill(bill)
def scrape_info(self, session, bill_number):
    bill_view_url = 'http://www.njleg.state.nj.us/bills/BillView.asp'
    bill_id = bill_number[0]
    bill_view_body = 'BillNumber=%s++++&LastSession=' % bill_number[0]

    with self.urlopen(bill_view_url, 'POST', bill_view_body) as bill_view_page:
        root = lxml.etree.fromstring(bill_view_page,
                                     lxml.etree.HTMLParser())

        title = bill_number[1]

        if bill_id[0] == 'A':
            chamber = 'General Assembly'
        elif bill_number[0][0] == 'S':
            chamber = 'Senate'

        bill = Bill(session, chamber, bill_id, title)

        # Grabbing sponsors
        sponsorship = root.xpath('string(//tr[1]/td[1]/div/font[3])').split()
        primary_count = sponsorship.count('Primary')
        sponsor_count = 1

        # Special case
        if session == 214 and bill_id == 'A101':
            sponsorship = root.xpath('string(//tr[1]/td[1]/div/font[5])').split()
            primary_count = sponsorship.count('Primary')

        for sp in root.xpath('//tr[1]/td[1]/div/font/a/font'):
            sponsor = sp.xpath('string()').split()
            if len(sponsor) == 3:
                leg = sponsor[1] + " " + sponsor[2] + " " + sponsor[0]
                leg = leg[0:len(leg) - 1]
            elif len(sponsor) == 2:
                leg = sponsor[1] + " " + sponsor[0]
                leg = leg[0:len(leg) - 1]

            if sponsor_count <= primary_count:
                sponsor_type = 'Primary'
            if sponsor_count > primary_count:
                sponsor_type = 'Co-sponsor'

            bill.add_sponsor(sponsor_type, leg)
            sponsor_count = sponsor_count + 1

        self.save_bill(bill)
def parse_bill(scraper, url):
    """Given a bill status URL, return a fully loaded Bill object, except
    for votes, which are expected to be handled externally.
    """
    session = extract_session(url)
    chamber = chamber_for_doctype(extract_doctype(url))
    s = get_soup(scraper, url)
    bill_id = extract_bill_id(s)
    landmark = s(text=re.compile(".*Short Description.*"))
    name_span = landmark[0].findParent().findNextSibling()
    bill_name = get_text(name_span)

    bill = Bill(session, chamber, bill_id, bill_name.strip(),
                status_url=url)

    actions = extract_actions(s)
    for chamber, action, date in actions:
        bill.add_action(chamber, action, date)
        # kwargs are permitted if we have 'em.

    sponsor_dict = extract_sponsors_from_actions(
        [action[1] for action in actions])
    for type, namelist in sponsor_dict.iteritems():
        for name in namelist:
            bill.add_sponsor(type, name)

    for name, link in extract_versions(scraper, s):
        bill.add_version(name, link)

    return bill
def scrape_bill(self, chamber, session, bill_number, ga_num):
    bill_url = self.urls['info'] % (bill_number, ga_num)

    with self.urlopen(bill_url) as page:
        page = lxml.html.fromstring(page)

        title = page.xpath("//span[@id='lblAbstract']")[0].text

        bill = Bill(session, chamber, bill_number, title)
        bill.add_source(bill_url)

        # Primary Sponsor
        sponsor = page.xpath("//span[@id='lblBillSponsor']")[0].text_content().split("by")[-1]
        sponsor = sponsor.replace('*', '').strip()
        bill.add_sponsor('primary', sponsor)

        # Co-sponsors unavailable for scraping (loaded into page via AJAX)

        # Full summary doc
        summary = page.xpath("//span[@id='lblBillSponsor']/a")[0]
        bill.add_document('Full summary', summary.get('href'))

        # Actions
        tables = page.xpath("//table[@id='tabHistoryAmendments_tabHistory_gvBillActionHistory']")
        actions_table = tables[0]
        action_rows = actions_table.xpath("tr[position()>1]")
        for ar in action_rows:
            action_taken = ar.xpath("td")[0].text
            action_date = datetime.datetime.strptime(
                ar.xpath("td")[1].text.strip(), '%m/%d/%Y')
            bill.add_action(chamber, action_taken, action_date)

        votes_link = page.xpath("//span[@id='lblBillVotes']/a")
        if len(votes_link) > 0:
            votes_link = votes_link[0].get('href')
            bill = self.scrape_votes(
                bill, sponsor,
                'http://wapp.capitol.tn.gov/apps/Billinfo/%s' % (votes_link,))

        self.save_bill(bill)
def scrape_assem_bills(self, chamber, insert, session):
    doc_type = [1, 3, 5, 6]
    for doc in doc_type:
        parentpage_url = 'http://www.leg.state.nv.us/Session/%s/Reports/HistListBills.cfm?DoctypeID=%s' % (insert, doc)
        links = self.scrape_links(parentpage_url)
        count = 0
        for link in links:
            count = count + 1
            page_path = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (insert, link)
            with self.urlopen(page_path) as page:
                root = lxml.etree.fromstring(page, lxml.etree.HTMLParser())

                bill_id = root.xpath('string(/html/body/div[@id="content"]/table[1]/tr[1]/td[1]/font)')
                title = root.xpath('string(/html/body/div[@id="content"]/table[1]/tr[5]/td)')
                bill = Bill(session, chamber, bill_id, title)

                primary, secondary = self.scrape_sponsors(page_path)

                if primary[0] == 'By:':
                    primary.pop(0)

                    if primary[0] == 'ElectionsProceduresEthicsand':
                        primary[0] = 'Elections Procedures Ethics and'

                    full_name = ''
                    for part_name in primary:
                        full_name = full_name + part_name + " "
                    bill.add_sponsor('primary', full_name)
                else:
                    for leg in primary:
                        bill.add_sponsor('primary', leg)
                for leg in secondary:
                    bill.add_sponsor('cosponsor', leg)

                self.scrape_actions(page_path, bill, "Assembly")
                self.scrape_votes(page_path, bill, "Assembly", insert, title)
                bill.add_source(page_path)
                self.save_bill(bill)
def parse_bill_xml(self, chamber, session, txt):
    root = lxml.etree.fromstring(txt)
    bill_id = " ".join(root.attrib["bill"].split(" ")[1:])
    bill_title = root.findtext("caption")

    if session[2] == "R":
        session = session[0:2]

    if bill_id[1] == "B":
        bill_type = ["bill"]
    elif bill_id[1] == "R":
        bill_type = ["resolution"]
    elif bill_id[1:3] == "CR":
        bill_type = ["concurrent resolution"]
    elif bill_id[1:3] == "JR":
        bill_type = ["joint resolution"]
    else:
        raise ScrapeError("Invalid bill_id: %s" % bill_id)

    bill = Bill(session, chamber, bill_id, bill_title, type=bill_type)

    for action in root.findall("actions/action"):
        act_date = dt.datetime.strptime(action.findtext("date"),
                                        "%m/%d/%Y").date()

        extra = {}
        extra["action_number"] = action.find("actionNumber").text
        comment = action.find("comment")
        if comment is not None and comment.text:
            extra["comment"] = comment.text.strip()

        actor = {"H": "lower",
                 "S": "upper",
                 "E": "executive"}[extra["action_number"][0]]

        desc = action.findtext("description").strip()

        if desc == "Amended":
            type = "amendment:passed"
        elif desc == "Amendment(s) offered":
            type = "amendment:introduced"
        elif desc == "Amendment amended":
            type = "amendment:amended"
        elif desc == "Amendment withdrawn":
            type = "amendment:withdrawn"
        elif desc.startswith("Received by the Secretary of"):
            type = "bill:introduced"
        elif desc == "Passed":
            type = "bill:passed"
        elif desc.startswith("Received from the"):
            type = "bill:introduced"
        elif desc.startswith("Signed by the Governor"):
            type = "governor:signed"
        elif desc == "Filed":
            type = "bill:introduced"
        else:
            type = "other"

        bill.add_action(actor, action.findtext("description"),
                        act_date, type=type, **extra)

    for author in root.findtext("authors").split(" | "):
        if author != "":
            bill.add_sponsor("author", author)
    for coauthor in root.findtext("coauthors").split(" | "):
        if coauthor != "":
            bill.add_sponsor("coauthor", coauthor)
    for sponsor in root.findtext("sponsors").split(" | "):
        if sponsor != "":
            bill.add_sponsor("sponsor", sponsor)
    for cosponsor in root.findtext("cosponsors").split(" | "):
        if cosponsor != "":
            bill.add_sponsor("cosponsor", cosponsor)

    bill["subjects"] = []
    for subject in root.iterfind("subjects/subject"):
        bill["subjects"].append(subject.text.strip())

    return bill
def scrape(self, chamber, year):
    session = "%s%d" % (year, int(year) + 1)
    if session not in [s_ for t in metadata['terms']
                       for s_ in t['sessions']]:
        raise NoDataForPeriod(year)

    if chamber == 'upper':
        measure_abbr = 'SB'
        chamber_name = 'SENATE'
        house_type = 'S'
    else:
        measure_abbr = 'AB'
        chamber_name = 'ASSEMBLY'
        house_type = 'A'

    bills = self.session.query(CABill).filter_by(
        session_year=session).filter_by(
        measure_type=measure_abbr)

    for bill in bills:
        bill_session = session
        if bill.session_num != '0':
            bill_session += ' Special Session %s' % bill.session_num

        bill_id = bill.short_bill_id
        version = self.session.query(CABillVersion).filter_by(
            bill=bill).filter(CABillVersion.bill_xml != None).first()
        if not version:
            # not enough data to import
            continue

        fsbill = Bill(bill_session, chamber, bill_id, version.title,
                      short_title=version.short_title)

        for author in version.authors:
            if author.house == chamber_name:
                fsbill.add_sponsor(author.contribution, author.name)

        for action in bill.actions:
            if not action.action:
                # NULL action text seems to be an error on CA's part,
                # unless it has some meaning I'm missing
                continue

            actor = action.actor or chamber
            actor = actor.strip()
            match = re.match(r'(Assembly|Senate)($| \(Floor)', actor)
            if match:
                actor = {'Assembly': 'lower',
                         'Senate': 'upper'}[match.group(1)]
            elif actor.startswith('Governor'):
                actor = 'executive'
            else:
                actor = re.sub('^Assembly', 'lower', actor)
                actor = re.sub('^Senate', 'upper', actor)

            type = []
            act_str = action.action
            if act_str.startswith('Introduced'):
                type.append('bill:introduced')
            if 'To Com' in act_str:
                type.append('committee:referred')
            if 'Read third time. Passed.' in act_str:
                type.append('bill:passed')
            if 'Approved by Governor' in act_str:
                type.append('bill:signed')
            if 'Item veto' in act_str:
                type.append('veto:line-item')
            if not type:
                type = ['other']

            fsbill.add_action(actor, act_str, action.action_date,
                              type=type)

        for vote in bill.votes:
            if vote.vote_result == '(PASS)':
                result = True
            else:
                result = False

            full_loc = vote.location.description
            first_part = full_loc.split(' ')[0].lower()
            if first_part in ['asm', 'assembly']:
                vote_chamber = 'lower'
                vote_location = ' '.join(full_loc.split(' ')[1:])
            elif first_part.startswith('sen'):
                vote_chamber = 'upper'
                vote_location = ' '.join(full_loc.split(' ')[1:])
            else:
                vote_chamber = ''
                vote_location = full_loc

            fsvote = Vote(vote_chamber,
                          vote.vote_date_time,
                          vote.motion.motion_text or '',
                          result,
                          vote.ayes, vote.noes, vote.abstain,
                          threshold=vote.threshold,
                          location=vote_location)

            for record in vote.votes:
                if record.vote_code == 'AYE':
                    fsvote.yes(record.legislator_name)
                elif record.vote_code.startswith('NO'):
                    fsvote.no(record.legislator_name)
                else:
                    fsvote.other(record.legislator_name)

            fsbill.add_vote(fsvote)

        self.save_bill(fsbill)
def parse_bill_xml(self, chamber, session, txt):
    root = lxml.etree.fromstring(txt)
    bill_id = ' '.join(root.attrib['bill'].split(' ')[1:])
    bill_title = root.findtext("caption")

    if session[2] == 'R':
        session = session[0:2]

    bill = Bill(session, chamber, bill_id, bill_title)

    for action in root.findall('actions/action'):
        act_date = dt.datetime.strptime(action.findtext('date'),
                                        "%m/%d/%Y")

        extra = {}
        extra['action_number'] = action.find('actionNumber').text
        comment = action.find('comment')
        if comment is not None and comment.text:
            extra['comment'] = comment.text.strip()

        actor = {'H': 'lower',
                 'S': 'upper',
                 'E': 'executive'}[extra['action_number'][0]]

        desc = action.findtext('description').strip()

        if desc == 'Amended':
            type = 'amendment:passed'
        elif desc == 'Amendment(s) offered':
            type = 'amendment:introduced'
        elif desc == 'Amendment amended':
            type = 'amendment:amended'
        elif desc == 'Amendment withdrawn':
            type = 'amendment:withdrawn'
        elif desc.startswith('Received by the Secretary of'):
            type = 'bill:introduced'
        elif desc == 'Passed':
            type = 'bill:passed'
        elif desc.startswith('Received from the'):
            type = 'bill:introduced'
        elif desc.startswith('Signed by the Governor'):
            type = 'bill:signed'
        elif desc == 'Filed':
            type = 'bill:introduced'
        else:
            type = 'other'

        bill.add_action(actor, action.findtext('description'),
                        act_date, type=type, **extra)

    for author in root.findtext('authors').split(' | '):
        if author != "":
            bill.add_sponsor('author', author)
    for coauthor in root.findtext('coauthors').split(' | '):
        if coauthor != "":
            bill.add_sponsor('coauthor', coauthor)
    for sponsor in root.findtext('sponsors').split(' | '):
        if sponsor != "":
            bill.add_sponsor('sponsor', sponsor)
    for cosponsor in root.findtext('cosponsors').split(' | '):
        if cosponsor != "":
            bill.add_sponsor('cosponsor', cosponsor)

    bill['subjects'] = []
    for subject in root.iterfind('subjects/subject'):
        bill['subjects'].append(subject.text.strip())

    return bill
def scrape_session_new(self, chamber, session):
    if chamber == "lower":
        bill_abbr = "H."
    else:
        bill_abbr = "S."

    bill_list_path = "docs/bills.cfm?Session=%s&Body=%s" % (
        session.split('-')[1], bill_abbr[0])
    bill_list_url = "http://www.leg.state.vt.us/" + bill_list_path
    bill_list = BeautifulSoup(self.urlopen(bill_list_url))

    bill_link_re = re.compile('.*?Bill=%s\.\d+.*' % bill_abbr[0])
    for bill_link in bill_list.findAll('a', href=bill_link_re):
        bill_id = bill_link.string
        bill_title = bill_link.parent.findNext('b').string
        bill_info_url = "http://www.leg.state.vt.us" + bill_link['href']

        bill = Bill(session, chamber, bill_id, bill_title)
        bill.add_source(bill_info_url)

        info_page = BeautifulSoup(self.urlopen(bill_info_url))

        text_links = info_page.findAll('blockquote')[1].findAll('a')
        for text_link in text_links:
            bill.add_version(text_link.string,
                             "http://www.leg.state.vt.us" +
                             text_link['href'])

        act_table = info_page.findAll('blockquote')[2].table
        for row in act_table.findAll('tr')[1:]:
            action = ""
            for s in row.findAll('td')[1].findAll(text=True):
                action += s + " "
            action = clean_action(action)

            match = re.search('Governor on (.*)$', action)
            if match:
                act_date = parse_exec_date(match.group(1).strip())
                actor = 'Governor'
            else:
                if row['bgcolor'] == 'Salmon':
                    actor = 'lower'
                else:
                    actor = 'upper'

                if row.td.a:
                    act_date = row.td.a.string
                else:
                    act_date = row.td.string

                try:
                    act_date = re.search(
                        '\d{1,2}/\d{1,2}/\d{4,4}', act_date).group(0)
                except AttributeError:
                    # No date, skip
                    continue

                act_date = dt.datetime.strptime(act_date, '%m/%d/%Y')

            bill.add_action(actor, action, act_date,
                            type=action_type(action))

            vote_link = row.find('a', text='Details')
            if vote_link:
                vote_url = vote_link.parent['href']
                self.parse_vote_new(bill, actor, vote_url)

        sponsors = info_page.find(
            text='Sponsor(s):').parent.parent.findAll('b')
        bill.add_sponsor('primary', sponsors[0].string)
        for sponsor in sponsors[1:]:
            bill.add_sponsor('cosponsor', sponsor.string)

        self.save_bill(bill)
def scrape_session_old(self, chamber, session):
    if chamber == "lower":
        bill_abbr = "H."
        chamber_name = "House"
        other_chamber = "Senate"
    else:
        bill_abbr = "S."
        chamber_name = "Senate"
        other_chamber = "House"

    start_date = '1/1/%s' % session.split('-')[0]
    data = urllib.urlencode({'Date': start_date,
                             'Body': bill_abbr[0],
                             'Session': session.split('-')[1]})
    bill_list_url = "http://www.leg.state.vt.us/database/"\
        "rintro/results.cfm"
    bill_list = BeautifulSoup(urllib2.urlopen(bill_list_url, data))

    bill_link_re = re.compile('.*?Bill=%s.\d+.*' % bill_abbr[0])
    for bill_link in bill_list.findAll('a', href=bill_link_re):
        bill_id = bill_link.string
        bill_title = bill_link.parent.parent.findAll('td')[1].string
        bill_info_url = "http://www.leg.state.vt.us" + bill_link['href']

        bill = Bill(session, chamber, bill_id, bill_title)
        bill.add_source(bill_info_url)

        info_page = BeautifulSoup(self.urlopen(bill_info_url))

        text_links = info_page.findAll('blockquote')[-1].findAll('a')
        for text_link in text_links:
            bill.add_version(text_link.string,
                             "http://www.leg.state.vt.us" +
                             text_link['href'])

        sponsors = info_page.find(
            text='Sponsor(s):').parent.findNext('td').findAll('b')
        bill.add_sponsor('primary', sponsors[0].string)
        for sponsor in sponsors[1:]:
            bill.add_sponsor('cosponsor', sponsor.string)

        # Grab actions from the originating chamber
        act_table = info_page.find(
            text='%s Status:' % chamber_name).findNext('table')
        for row in act_table.findAll('tr')[3:]:
            action = clean_action(row.td.string.replace(
                '&nbsp;', '').strip(':'))

            act_date = row.findAll('td')[1].b.string.replace('&nbsp;', '')
            if act_date != "":
                detail = row.findAll('td')[2].b
                if detail and detail.string != "":
                    action += ": %s" % detail.string.replace('&nbsp;', '')
                bill.add_action(chamber, action, act_date,
                                type=action_type(action))

        # Grab actions from the other chamber
        act_table = info_page.find(
            text='%s Status:' % other_chamber).findNext('table')
        if act_table:
            if chamber == 'upper':
                act_chamber = 'lower'
            else:
                act_chamber = 'upper'
            for row in act_table.findAll('tr')[3:]:
                action = clean_action(row.td.string.replace(
                    '&nbsp;', '').strip(':'))

                act_date = row.findAll('td')[1].b.string.replace(
                    '&nbsp;', '')
                if act_date != "":
                    detail = row.findAll('td')[2].b
                    if detail and detail.string != "":
                        action += ": %s" % detail.string.replace(
                            '&nbsp;', '')
                    date = dt.datetime.strptime(act_date, '%m/%d/%Y')
                    bill.add_action(act_chamber, action, date,
                                    type=action_type(action))

        self.save_bill(bill)
def scrape_session(self, chamber, year):
    if chamber == 'upper':
        bill_abbr = 'SB|SCR|SJR'
    elif chamber == 'lower':
        bill_abbr = 'HB|HCR|HJR'

    # Sessions last 2 years, 1993-1994 was the 18th
    session = str(18 + ((int(year) - 1993) / 2))
    year2 = str(int(year) + 1)

    # Full calendar year
    date1 = '0101' + year[2:]
    date2 = '1231' + year2[2:]

    # Get bill list
    bill_list_url = 'http://www.legis.state.ak.us/'\
        'basis/range_multi.asp?session=%s&date1=%s&date2=%s' % (
            session, date1, date2)
    self.log("Getting bill list for %s %s (this may take a long time)." %
             (chamber, session))
    bill_list = self.soup_parser(self.urlopen(bill_list_url))

    # Find bill links
    re_str = "bill=%s\d+" % bill_abbr
    links = bill_list.findAll(href=re.compile(re_str))

    for link in links:
        bill_id = link.contents[0].replace(' ', '')
        bill_name = link.parent.parent.findNext('td').find(
            'font').contents[0].strip()
        bill = Bill(session, chamber, bill_id, bill_name.strip())

        # Get the bill info page and strip malformed tags
        info_url = "http://www.legis.state.ak.us/basis/%s" % link['href']
        info_page = self.soup_parser(self.urlopen(info_url))
        bill.add_source(info_url)

        # Get sponsors
        spons_str = info_page.find(
            text="SPONSOR(s):").parent.parent.contents[1]
        sponsors_match = re.match(
            ' (SENATOR|REPRESENTATIVE)\([Ss]\) ([^,]+(,[^,]+){0,})',
            spons_str)
        if sponsors_match:
            sponsors = sponsors_match.group(2).split(',')
            bill.add_sponsor('primary', sponsors[0].strip())
            for sponsor in sponsors[1:]:
                bill.add_sponsor('cosponsor', sponsor.strip())
        else:
            # Committee sponsorship
            bill.add_sponsor('committee', spons_str.strip())

        # Get actions
        act_rows = info_page.findAll('table', 'myth')[1].findAll('tr')[1:]
        for row in act_rows:
            cols = row.findAll('td')
            act_date = cols[0].font.contents[0]
            act_date = dt.datetime.strptime(act_date, '%m/%d/%y')

            if cols[2].font.string == "(H)":
                act_chamber = "lower"
            elif cols[2].font.string == "(S)":
                act_chamber = "upper"
            else:
                act_chamber = chamber

            action = cols[3].font.contents[0].strip()
            if re.match("\w+ Y(\d+) N(\d+)", action):
                try:
                    vote = self.parse_vote(bill, action, act_chamber,
                                           act_date, cols[1].a['href'])
                    bill.add_vote(vote)
                except:
                    self.log("Failed parsing vote at %s" %
                             cols[1].a['href'])

            bill.add_action(act_chamber, action, act_date)

        # Get subjects
        bill['subjects'] = []
        subject_link_re = re.compile('.*subject=\w+$')
        for subject_link in info_page.findAll('a', href=subject_link_re):
            subject = subject_link.contents[0].strip()
            bill['subjects'].append(subject)

        # Get versions
        text_list_url = "http://www.legis.state.ak.us/"\
            "basis/get_fulltext.asp?session=%s&bill=%s" % (
                session, bill_id)
        text_list = self.soup_parser(self.urlopen(text_list_url))
        bill.add_source(text_list_url)

        text_link_re = re.compile('^get_bill_text?')
        for text_link in text_list.findAll('a', href=text_link_re):
            text_name = text_link.parent.previousSibling.contents[0]
            text_name = text_name.strip()
            text_url = "http://www.legis.state.ak.us/basis/%s" % (
                text_link['href'])
            bill.add_version(text_name, text_url)

        self.save_bill(bill)
def scrape(self, chamber, session):
    bill_search_url = 'http://www.camaraderepresentantes.org/cr_buscar.asp'

    bill_types = {'Project': 'P', 'Resolution': 'R',
                  'Joint Resolution': 'RC',
                  'Concurrent Resolution': 'RK',
                  'Appointment': 'N'}
    #bodies = {'upper':'S', 'lower':'C'}
    bodies = {'upper': 'S'}

    bill_search_page = lxml.html.parse(bill_search_url).getroot()
    search_form = bill_search_page.forms[0]

    for body in bodies.itervalues():
        for bill_type in bill_types.itervalues():
            search_form.fields['cuerpo'] = body
            search_form.fields['tipo'] = bill_type
            search_form.fields['autor'] = 'NA'
            if year_from_session(session) == '2009':
                search_form.fields['f2'] = '12/31/2009'
            elif year_from_session(session) == '2010':
                search_form.fields['f1'] = '01/01/2010'

            result = lxml.html.parse(
                lxml.html.submit_form(search_form)).getroot()
            table_elements = result.cssselect('table')
            table_elements.pop()
            bill_elements = grouper(3, table_elements)

            for actions, complete_data, bill_data in bill_elements:
                td_elements = bill_data.cssselect('td')
                title = td_elements[1].text_content()
                date = td_elements[3].text_content()
                description = td_elements[5].text_content()
                authors = td_elements[7].text_content().split('/')

                bill = Bill(session, chamber, title, description)
                for author in authors:
                    if len(authors) == 1:
                        bill.add_sponsor('primary', author)
                    else:
                        bill.add_sponsor('cosponsor', author)

                td_elements = actions.cssselect('td')
                td_elements = td_elements[4:-1]
                action_elements = grouper(3, td_elements)

                for date_element, action, empty in action_elements:
                    # Clean unicode character
                    date_text = date_element.text_content().replace(u'\xa0', u'')
                    date = dt.datetime.strptime(date_text, '%m/%d/%Y')
                    action_text = action.text_content()
                    try:
                        doc_link_part = action.iterlinks().next()[2]
                        if 'voto' in doc_link_part:
                            raise
                        doc_link = doc_link_url(doc_link_part)
                        bill.add_version(action_text, doc_link)
                    except:
                        pass
                    bill.add_action(chamber, action_text, date)

                self.save_bill(bill)
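# Note: the scraper above depends on a grouper() helper that is not defined in
# this excerpt. A minimal sketch follows, using the standard itertools recipe,
# which matches the grouper(3, iterable) call pattern used above; the module's
# actual helper (and the doc_link_url() helper) may differ.
from itertools import izip_longest

def grouper(n, iterable, fillvalue=None):
    "Collect data into fixed-length chunks: grouper(3, 'ABCDEFG', 'x') --> ABC DEF Gxx"
    args = [iter(iterable)] * n
    return izip_longest(fillvalue=fillvalue, *args)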