def scrape(self, chamber, term):
    """Scrape the member list for ``chamber`` and save every legislator found.

    Args:
        chamber: chamber identifier; passed to ``legislators_url`` to build
            the listing URL and recorded on each ``Legislator``.
        term: term name. Only ``'2009-2010'`` is accepted.

    Raises:
        NoDataForPeriod: if ``term`` is anything other than ``'2009-2010'``.
    """
    # Pennsylvania doesn't make member lists easily available
    # for previous sessions, unfortunately
    if term != '2009-2010':
        raise NoDataForPeriod(term)

    # Raw string so ``\d`` reaches the regex engine untouched; compiled once
    # because the pattern is the same for every link.
    district_re = re.compile(r"District (\d+)")
    # Unknown party codes are passed through unchanged (see .get below).
    party_names = {'R': 'Republican', 'D': 'Democratic'}

    leg_list_url = legislators_url(chamber)

    with self.urlopen(leg_list_url) as page:
        page = lxml.html.fromstring(page)

        for link in page.xpath("//a[contains(@href, '_bio.cfm')]"):
            # Presumably the link text ends with a 4-character party suffix
            # such as " (R)" -- TODO confirm against the live page.
            full_name = link.text[0:-4]
            # The district number is read from the text following the link.
            district = district_re.search(link.tail).group(1)

            party_code = link.text[-2]
            party = party_names.get(party_code, party_code)

            legislator = Legislator(term, chamber, district,
                                    full_name, party=party)
            legislator.add_source(leg_list_url)
            self.save_legislator(legislator)
def scrape(self, chamber, year):
    """Scrape the member list for ``chamber`` and save every legislator found.

    Args:
        chamber: chamber identifier; passed to ``legislators_url`` to build
            the listing URL and recorded on each ``Legislator``.
        year: first year of the session (anything ``int()`` accepts). The
            session name is built as ``"<year>-<year + 1>"``.

    Returns:
        None. Years before 2009 are skipped without scraping anything.
    """
    # Pennsylvania doesn't make member lists easily available
    # for previous sessions, unfortunately. Older years are skipped
    # silently rather than raising.
    if int(year) < 2009:
        return

    session = "%s-%d" % (year, int(year) + 1)

    # Raw strings so the regex escapes are not treated as string escapes;
    # compiled once because both patterns are loop-invariant.
    bio_href_re = re.compile(r'_bio\.cfm\?id=')
    district_re = re.compile(r"District (\d+)")
    # Unknown party codes are passed through unchanged (see .get below).
    party_names = {'R': "Republican", 'D': "Democrat"}

    leg_list_url = legislators_url(chamber)

    with self.urlopen(leg_list_url) as member_list_page:
        member_list_page = BeautifulSoup(member_list_page)

        for link in member_list_page.findAll('a', href=bio_href_re):
            # Presumably the link text looks like "Last, First M. (R)":
            # a name followed by a 4-character party suffix -- TODO confirm.
            label = link.contents[0]
            full_name = label[0:-4]

            last_name = full_name.split(',')[0]
            name_parts = full_name.split(' ')
            first_name = name_parts[1]
            if len(name_parts) > 2:
                middle_name = name_parts[2].strip(',')
            else:
                middle_name = ''

            party_code = label[-2]
            party = party_names.get(party_code, party_code)

            # The district number is read from the second child of the
            # link's parent element.
            district = district_re.search(
                link.parent.contents[1]).group(1)

            legislator = Legislator(session, chamber, district,
                                    full_name, first_name, last_name,
                                    middle_name, party)
            legislator.add_source(leg_list_url)
            self.save_legislator(legislator)