def scrape(self, chamber, session):
    """Route scraping of *session* to the handler for its era.

    The starting year is obtained from ``year_from_session(session)`` and
    the session is re-labelled as ``"<year>-<year + 1>"`` before being
    passed on.  Years of 2009 and later are handled by
    ``scrape_session_2009``; earlier years by ``scrape_session_old``.
    """
    start_year = year_from_session(session)

    # NOTE(review): the label coerces the year with int() while the
    # comparison below uses it as returned -- confirm that
    # year_from_session() always yields an int.
    session_label = "%s-%d" % (start_year, int(start_year) + 1)

    handler = (self.scrape_session_2009 if start_year >= 2009
               else self.scrape_session_old)
    handler(chamber, session_label)
def scrape(self, chamber, session):
    """Scrape the 2009 legislator directory page for one chamber.

    The page URL comes from ``legs_url(chamber)``.  Rows of the first
    table on that page (minus its first row) are consumed three at a
    time: a name/party row, a district row and an e-mail row.  One
    ``Legislator`` is built and saved per triple, with the first link of
    the name row recorded as its source.

    Raises:
        NoDataForPeriod: if ``year_from_session(session)`` is not 2009.
    """
    # All other years are stored in a pdf
    # http://www.capitol.hawaii.gov/session2009/misc/statehood.pdf
    if year_from_session(session) != 2009:
        raise NoDataForPeriod(session)

    legislators_page_url = legs_url(chamber)

    with self.urlopen(legislators_page_url) as legislators_page_html:
        legislators_page = lxml.html.fromstring(legislators_page_html)

        # Only the first table on the page is read.
        legislators_table = legislators_page.cssselect('table')[0]

        # Eliminate the leading non-legislator row.
        rows = legislators_table.cssselect('tr')[1:]

        # Group legislator data; each group is unpacked as three rows.
        # (presumably grouper() chunks the rows in order -- verify helper)
        for name_row, district_row, email_row in grouper(3, rows):
            # Built-in next() works on Python 2.6+ and Python 3, unlike
            # the Python-2-only iterator method .next().
            _, _, link, _ = next(name_row.iterlinks())
            source = base_url() + link

            name_cell = name_row.cssselect('td')[0]
            name, _sep, party_marker = name_cell.text_content().partition("(")

            # Strip both ends: the text before "(" normally ends in a
            # space, which lstrip() alone would leave on the name.
            name = name.strip()

            # Compare the stripped marker so trailing whitespace in the
            # cell cannot push a Republican into the default branch.
            if party_marker.strip() == 'R)':
                party = 'Republican'
            else:
                party = 'Democrat'

            # The district is read from the second cell of its row.
            district = district_row.cssselect('td')[1].text_content()

            # The e-mail address is the text of the first link in its row.
            email = email_row.cssselect('a')[0].text_content().strip()

            leg = Legislator(session, chamber, district, name,
                             "", "", "", party, official_email=email)
            leg.add_source(source)
            self.save_legislator(leg)
def scrape(self, chamber, session):
    """Scrape the 2009 legislator directory page for one chamber.

    The ``'upper'`` chamber is read from ``sendir.asp`` and any other
    chamber value from ``repdir.asp`` under ``BASE_URL``.  Table rows
    (minus the first) are consumed three at a time: a name/party row, a
    district row and an e-mail row.  One ``Legislator`` is built and
    saved per triple, with the first link of the name row recorded as
    its source.

    Raises:
        NoDataForPeriod: if ``year_from_session(session)`` is not 2009.
    """
    # All other years are stored in a pdf
    # http://www.capitol.hawaii.gov/session2009/misc/statehood.pdf
    if year_from_session(session) != 2009:
        raise NoDataForPeriod(session)

    if chamber == 'upper':
        legislators_page_url = BASE_URL + "/site1/info/direct/sendir.asp"
    else:
        legislators_page_url = BASE_URL + "/site1/info/direct/repdir.asp"

    with self.urlopen(legislators_page_url) as legislators_page_html:
        legislators_page = lxml.html.fromstring(legislators_page_html)

        # get all rows (except first) of first table
        # NOTE(review): in XPath, //table[1] matches every table that is
        # the first table child of its parent, not only the first table
        # in the document -- confirm the page has no nested tables.
        rows = legislators_page.xpath('//table[1]/tr')[1:]

        # group legislator data in sets of 3
        # (presumably grouper() chunks the rows in order -- verify helper)
        for name_row, district_row, email_row in grouper(3, rows):
            # Built-in next() works on Python 2.6+ and Python 3, unlike
            # the Python-2-only iterator method .next().
            _, _, link, _ = next(name_row.iterlinks())
            source = BASE_URL + link

            name_cell = name_row.cssselect('td')[0]
            name, _sep, party_marker = name_cell.text_content().partition("(")

            # remove surrounding white space
            name = name.strip()

            # Compare the stripped marker so trailing whitespace in the
            # cell cannot push a Republican into the default branch.
            if party_marker.strip() == 'R)':
                party = 'Republican'
            else:
                party = 'Democratic'

            # The district is read from the second cell of its row.
            district = district_row.cssselect('td')[1].text_content()

            # The e-mail address is the text of the first link in its row.
            email = email_row.cssselect('a')[0].text_content().strip()

            leg = Legislator(session, chamber, district, name,
                             party=party, official_email=email)
            leg.add_source(source)
            self.save_legislator(leg)