def scrape_legislators(self, chamber, year):
    if year not in self.metadata['session_details']:
        raise NoDataForYear(year)

    if chamber == 'lower':
        title = 'Representative'
    else:
        title = 'Senator'

    url = 'http://www.le.state.ut.us/asp/roster/roster.asp?year=%s' % year
    leg_list = self.soup_parser(self.urlopen(url))

    for row in leg_list.findAll('table')[1].findAll('tr')[1:]:
        tds = row.findAll('td')

        leg_title = tds[1].find(text=True)
        if leg_title == title:
            fullname = tds[0].find(text=True)
            last_name = fullname.split(',')[0]
            first_name = fullname.split(' ')[1]
            if len(fullname.split(' ')) > 2:
                middle_name = fullname.split(' ')[2]
            else:
                # No middle name
                middle_name = ''

            leg = Legislator(year, chamber, tds[3].find(text=True),
                             fullname, first_name, last_name,
                             middle_name, tds[2].find(text=True))
            leg.add_source(url)
            self.add_legislator(leg)
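# Several of these scrapers raise NoDataForYear for unsupported years.
# Its definition isn't shown here; a minimal sketch, assuming it is a
# plain Exception subclass that carries the offending year:
class NoDataForYear(Exception):
    """Raised when no data is available for the requested year."""

    def __init__(self, year):
        self.year = year
        Exception.__init__(self, "no data exists for %s" % year)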
def scrape_legislators(self, chamber, year): if year != "2009": raise NoDataForYear(year) session = "%d-%d" % (int(year), int(year) + 1) url = "http://www.ncga.state.nc.us/gascripts/members/" "memberList.pl?sChamber=" if chamber == "lower": url += "House" else: url += "Senate" with self.urlopen_context(url) as leg_list_data: leg_list = self.soup_parser(leg_list_data) leg_table = leg_list.find("div", id="mainBody").find("table") for row in leg_table.findAll("tr")[1:]: party = row.td.contents[0].strip() if party == "Dem": party = "Democrat" elif party == "Rep": party = "Republican" district = row.findAll("td")[1].contents[0].strip() full_name = row.findAll("td")[2].a.contents[0].strip() full_name = full_name.replace(u"\u00a0", " ") (first_name, last_name, middle_name, suffix) = split_name(full_name) legislator = Legislator( session, chamber, district, full_name, first_name, last_name, middle_name, party, suffix=suffix ) legislator.add_source(url) self.add_legislator(legislator)
def scrape_reps(self, year):
    if year != '2009':
        return

    leg_page_url = "http://www.flhouse.gov/Sections/Representatives/"\
        "representatives.aspx"
    leg_page = BeautifulSoup(self.urlopen(leg_page_url))

    table = leg_page.find('table',
                          id='ctl00_ContentPlaceHolder1_ctrlContentBox'
                          '_ctrlPageContent_ctl00_dgLegislators')

    for row in table.findAll('tr')[1:]:
        # Replace non-breaking spaces with plain spaces
        full = row.findAll('td')[1].a.contents[0].replace(u'\xa0', ' ')
        (last, first, middle) = self.split_name(full)

        district = row.findAll('td')[3].contents[0]

        party = row.findAll('td')[2].contents[0]
        if party == 'D':
            party = 'Democrat'
        elif party == 'R':
            party = 'Republican'

        leg = Legislator(year, 'lower', district, full,
                         first, last, middle, party)
        leg.add_source(leg_page_url)
        self.add_legislator(leg)
def scrape_reps(self, year):
    rep_url = 'http://www.house.state.tx.us/members/welcome.php'

    with self.urlopen_context(rep_url) as page:
        root = lxml.etree.fromstring(page, lxml.etree.HTMLParser())

        for el in root.xpath('//form[@name="frmMembers"]/table/tr')[1:]:
            full_name = el.xpath('string(td/a/font/span)')
            district = el.xpath('string(td[2]/span)')
            county = el.xpath('string(td[3]/span)')

            if full_name.startswith('District'):
                # Ignore empty seats
                continue

            pre, first, last, suffixes = name_tools.split(full_name)
            party = ''

            leg = Legislator('81', 'lower', district, full_name,
                             first, last, '', party,
                             suffix=suffixes)
            leg.add_source(rep_url)

            # Is there anything out there that handles meta refresh?
            redirect_url = el.xpath('td/a')[0].attrib['href']
            redirect_url = ('http://www.house.state.tx.us/members/' +
                            redirect_url)
            details_url = redirect_url

            with self.urlopen_context(redirect_url) as redirect_page:
                redirect = lxml.etree.fromstring(redirect_page,
                                                 lxml.etree.HTMLParser())
                try:
                    filename = redirect.xpath(
                        "//meta[@http-equiv='refresh']"
                    )[0].attrib['content']
                    filename = filename.split('0;URL=')[1]
                    details_url = details_url.replace('welcome.htm',
                                                      filename)
                except (IndexError, KeyError):
                    # The Speaker's member page does not redirect.
                    # The Speaker is not on any committees,
                    # so we can just continue with the next member.
                    self.save_legislator(leg)
                    continue

            with self.urlopen_context(details_url) as details_page:
                details = lxml.etree.fromstring(details_page,
                                                lxml.etree.HTMLParser())
                comms = details.xpath(
                    "//b[contains(text(), 'Committee Assignments')]/"
                    "..//a")
                for comm in comms:
                    leg.add_role('committee member', '81',
                                 committee=comm.text.strip())

            self.save_legislator(leg)
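# Answering the question in the comment above: nothing standard handles
# <meta http-equiv="refresh"> automatically, but the redirect-following
# logic could be factored out. A minimal sketch under that assumption;
# follow_meta_refresh is a hypothetical helper, not part of the
# scraper framework:
def follow_meta_refresh(doc):
    """Return the target URL from a page's meta-refresh tag, or None."""
    metas = doc.xpath("//meta[@http-equiv='refresh']")
    if not metas:
        return None
    # Content looks like "0;URL=member.htm"
    content = metas[0].attrib.get('content', '')
    parts = content.split('URL=', 1)
    if len(parts) == 2:
        return parts[1]
    return None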
def scrape_senators(self, year):
    senator_url = 'http://www.senate.state.tx.us/75r/senate/senmem.htm'

    with self.urlopen_context(senator_url) as page:
        root = lxml.etree.fromstring(page, lxml.etree.HTMLParser())

        for el in root.xpath('//table[@summary="senator identification"]'):
            sen_link = el.xpath('tr/td[@headers="senator"]/a')[0]
            full_name = sen_link.text
            district = el.xpath('string(tr/td[@headers="district"])')
            party = el.xpath('string(tr/td[@headers="party"])')

            pre, first, last, suffixes = name_tools.split(full_name)

            leg = Legislator('81', 'upper', district, full_name,
                             first, last, '', party,
                             suffix=suffixes)
            leg.add_source(senator_url)

            details_url = ('http://www.senate.state.tx.us/75r/senate/' +
                           sen_link.attrib['href'])

            with self.urlopen_context(details_url) as details_page:
                details = lxml.etree.fromstring(details_page,
                                                lxml.etree.HTMLParser())

                comms = details.xpath(
                    "//h2[contains(text(), 'Committee Membership')]")[0]
                comms = comms.getnext()

                for comm in comms.xpath('li/a'):
                    comm_name = comm.text
                    if comm.tail:
                        comm_name += comm.tail

                    leg.add_role('committee member', '81',
                                 committee=comm_name.strip())

            self.save_legislator(leg)
def scrape_legislators(self, chamber, year):
    year = int(year)
    session = self.internal_sessions[year][0][1]

    # Iterating through subsessions would be a better way to do this.
    # Data only exists for odd (session-start) years, except for the
    # current biennium.
    if year % 2 == 0 and (year != dt.date.today().year and
                          year + 1 != dt.date.today().year):
        raise NoDataForYear(year)

    if chamber == "upper":
        url = ("http://legis.wi.gov/w3asp/contact/"
               "legislatorslist.aspx?house=senate")
    else:
        url = ("http://legis.wi.gov/w3asp/contact/"
               "legislatorslist.aspx?house=assembly")

    body = unicode(self.urlopen(url), "latin-1")
    page = lxml.html.fromstring(body)

    for row in page.cssselect("#ctl00_C_dgLegData tr"):
        if len(row.cssselect("td a")) > 0:
            rep_url = list(row)[0].cssselect("a[href]")[0].get("href")

            (full_name, party) = re.findall(
                r"([\w\-\,\s\.]+)\s+\(([\w])\)",
                list(row)[0].text_content())[0]

            pre, first, last, suffixes = name_tools.split(full_name)
            district = str(int(list(row)[2].text_content()))

            leg = Legislator(session, chamber, district, full_name,
                             first, last, "", party,
                             suffix=suffixes)
            leg.add_source(rep_url)
            leg = self.add_committees(leg, rep_url, session)
            self.save_legislator(leg)
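# name_tools.split (used by the Texas and Wisconsin scrapers above)
# comes from an external helper module and returns a
# (prefix, first, last, suffixes) tuple. A rough sketch of that
# contract for illustration only; the real implementation is more
# thorough:
def split(full_name):
    """Split a name into (prefix, first, last, suffixes)."""
    pre, suf = '', ''
    parts = full_name.replace(',', ' ').split()
    if parts and parts[0].endswith('.'):
        pre = parts.pop(0)
    if parts and parts[-1].rstrip('.') in ('Jr', 'Sr', 'II', 'III', 'IV'):
        suf = parts.pop()
    first = parts[0]
    last = ' '.join(parts[1:])
    return (pre, first, last, suf)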
def scrape_pre_2003_legislators(self, chamber, year, session, suffix):
    url = 'http://leg.mt.gov/css/Sessions/%d%s/legname.asp' % (session,
                                                               suffix)
    page_data = self.parser(self.urlopen(url))

    if year == 2001:
        if chamber == 'upper':
            # 'Legislatore' presumably matches the table name as it is
            # spelled on the source page, so don't "fix" it here
            tableName = '57th Legislatore Roster Senate (2001-2002)'
            startRow = 3
        else:
            tableName = '57th Legislator Roster (House)(2001-2002)'
            startRow = 5
    elif year == 1999:
        if chamber == 'upper':
            tableName = 'Members of the Senate'
            startRow = 3
        else:
            tableName = 'Members of the House'
            startRow = 5

    table = page_data.find('table', attrs={'name': tableName})
    for row in table.findAll('tr')[startRow:]:
        row = row.findAll('td')

        # Ignore rows with just an email address in them
        if str(row[0].contents[0]).strip() == ' ':
            continue

        # The name lives in a different spot when it is a link
        if row[0].find('a'):
            name = row[0].contents[0].next
            party_letter = name.next[2]
        else:
            if chamber == 'upper' and year == 2001:
                name, party_letter = row[0].contents[2].rsplit(' (', 1)
            else:
                name, party_letter = row[0].contents[0].rsplit(' (', 1)
            party_letter = party_letter[0]

        # Pull first name, last name and suffix out of the name string
        nameParts = [namePart.strip() for namePart in name.split(',')]
        assert len(nameParts) < 4

        if len(nameParts) == 2:
            # Case: last_name, first_name
            last_name, first_name = nameParts
        elif len(nameParts) == 3:
            # Case: last_name, suffix, first_name
            last_name = ' '.join(nameParts[0:2])
            first_name = nameParts[2]

        district = row[2].contents[0].strip()

        if party_letter == 'R':
            party = 'Republican'
        elif party_letter == 'D':
            party = 'Democrat'
        else:
            # Haven't yet run into others, so not sure how the state
            # abbreviates them
            party = party_letter

        legislator = Legislator(session, chamber, district,
                                '%s %s' % (first_name, last_name),
                                first_name, last_name, '', party)
        legislator.add_source(url)
        self.add_legislator(legislator)
def scrape_legislators(self, chamber, year):
    if int(year) != 2009:
        return

    session = "%s-%d" % (year, int(year) + 1)

    # What Vermont claims are Word and Excel files are actually
    # just HTML tables.
    # What Vermont claims is a CSV file is actually one row of comma
    # separated values followed by a ColdFusion error.
    leg_url = "http://www.leg.state.vt.us/legdir/"\
        "memberdata.cfm/memberdata.doc?FileType=W"
    leg_table = BeautifulSoup(self.urlopen(leg_url))

    for tr in leg_table.findAll('tr')[1:]:
        leg_cham = tr.findAll('td')[3].contents[0]
        if leg_cham == 'H' and chamber == 'upper':
            continue
        if leg_cham == 'S' and chamber == 'lower':
            continue

        district = tr.findAll('td')[5].contents[0]
        district = district.replace(' District', '').strip()

        first = tr.findAll('td')[6].contents[0]

        middle = tr.findAll('td')[7]
        if len(middle.contents) == 0:
            middle = ''
        else:
            middle = middle.contents[0].strip()

        last = tr.findAll('td')[8].contents[0]

        if len(middle) == 0:
            full = "%s, %s" % (last, first)
        else:
            full = "%s, %s %s." % (last, first, middle)

        official_email = tr.findAll('td')[9]
        if len(official_email.contents) == 0:
            official_email = ''
        else:
            official_email = official_email.contents[0]

        party = tr.findAll('td')[4].contents[0]
        if party == 'D':
            party = 'Democrat'
        elif party == 'R':
            party = 'Republican'
        elif party == 'I':
            party = 'Independent'
        elif party == 'P':
            party = 'Progressive'

        leg = Legislator(session, chamber, district, full,
                         first, last, middle, party,
                         official_email=official_email)
        leg.add_source(leg_url)
        self.save_legislator(leg)
def scrape_legislators(self, chamber, year):
    # Data is available from 1993 on
    if int(year) < 1993 or int(year) > dt.date.today().year:
        raise NoDataForYear(year)

    # Expect the first year of a session (odd)
    if int(year) % 2 != 1:
        raise NoDataForYear(year)

    if chamber == 'upper':
        chamber_abbr = 'S'
    else:
        chamber_abbr = 'H'

    # Sessions are numbered from the 18th, which began in 1993
    session = str(18 + ((int(year) - 1993) / 2))

    leg_list_url = "http://www.legis.state.ak.us/"\
        "basis/commbr_info.asp?session=%s" % session
    leg_list = self.soup_parser(self.urlopen(leg_list_url))

    leg_re = r"get_mbr_info.asp\?member=.+&house=%s&session=%s" % (
        chamber_abbr, session)
    links = leg_list.findAll(href=re.compile(leg_re))

    for link in links:
        member_url = "http://www.legis.state.ak.us/basis/" + link['href']
        member_page = self.soup_parser(self.urlopen(member_url))

        if member_page.find('td', text=re.compile('Resigned')):
            # Need a better way to handle this than just dropping
            continue

        full_name = member_page.findAll('h3')[1].contents[0]
        full_name = ' '.join(full_name.split(' ')[1:])
        full_name = re.sub(r'\s+', ' ', full_name).strip()

        first_name = full_name.split(' ')[0]
        last_name = full_name.split(' ')[-1]
        middle_name = ' '.join(full_name.split(' ')[1:-1])

        code = link['href'][24:27]

        district = member_page.find(text=re.compile("District:"))
        district = district.strip().split(' ')[-1]

        party = member_page.find(text=re.compile("Party: "))
        party = ' '.join(party.split(' ')[1:])

        leg = Legislator(session, chamber, district, full_name,
                         first_name, last_name, middle_name,
                         party, code=code)
        leg.add_source(member_url)
        self.save_legislator(leg)
def fetch_member(self, url, name, session, chamber):
    abbr = {'R': 'Republican', 'D': 'Democrat', 'I': 'Independent'}
    url = "http://leg1.state.va.us/%s" % url

    with self.soup_context(url) as member:
        ex = member.findAll('table', text=re.compile(re.escape(name)))
        if ex == []:
            raise Exception("Parse error fetching member %s" % name)
        else:
            ex = ex[0].parent.nextSibling.nextSibling.string.split()

        # Some people are "Joe X. Schmoe;Resigned". Fantastic.
        # Some other people are "Joe X. Schmoe (resigned".
        name = re.split(r'\;|\(', name)[0]

        name_parts = name.split()
        first_name = name_parts[0]

        last = name_parts[-1]
        if re.match(r'[IV]+$|\bJr\b\.$|\b(Sr)\b\.$', last):
            last_name = name_parts[-2]
        else:
            last_name = last

        if name_parts[1] == last_name:
            middle_name = ''
        else:
            middle_name = name_parts[1]

        # Deal with the Van Houtens of the world
        # (also, watch out for their rugged Danish relatives...)
        if name_parts[1] == 'Van':
            middle_name = ''
            last_name = name_parts[1] + ' ' + last_name

        last_name = last_name.replace(',', '')
        middle_name = middle_name.replace('.', '')

        party = ex[0][1]
        district = ex[-1]  # ex looks like [_, _, district, _]

        leg = Legislator(session=session, chamber=chamber,
                         district=district,
                         full_name=name.strip(),
                         first_name=first_name.strip(),
                         last_name=last_name.strip(),
                         middle_name=middle_name.replace('.', '').strip(),
                         party=abbr[party])
        leg.add_source(url)

        # Not totally sure how to handle legislators in subsessions,
        # so only add them if the matcher doesn't already know about
        # them (keyed on sanitized name and district).
        sanitized = leg['full_name'].replace('.', '').lower()
        if (self.matcher[chamber][sanitized] and
                self.matcher[chamber][sanitized][2] == district):
            return

        self.save_legislator(leg)
def scrape_new_legislators(self, chamber, session):
    """
    Scrape legislators from 2009 and later.
    """
    if chamber == 'upper':
        search = 'Senate Members'
    else:
        search = 'House Members'

    leg_list_url = "http://legis.state.sd.us/sessions/%s/"\
        "MemberMenu.aspx" % (session)
    leg_list = self.soup_parser(self.urlopen(leg_list_url))

    list_div = leg_list.find(text=search).findNext('div')

    for link in list_div.findAll('a'):
        full_name = link.contents[0].strip()
        first_name = full_name.split(', ')[1].split(' ')[0]
        last_name = full_name.split(',')[0]
        middle_name = ''

        leg_page_url = "http://legis.state.sd.us/sessions/%s/%s" % (
            session, link['href'])
        leg_page = self.soup_parser(self.urlopen(leg_page_url))

        party = leg_page.find(
            id="ctl00_contentMain_spanParty").contents[0].strip()

        district = leg_page.find(
            id="ctl00_contentMain_spanDistrict").contents[0]
        district = district.strip().lstrip('0')

        occ_span = leg_page.find(id="ctl00_contentMain_spanOccupation")
        if len(occ_span.contents) > 0:
            occupation = occ_span.contents[0].strip()
        else:
            occupation = None

        legislator = Legislator(session, chamber, district,
                                full_name, first_name, last_name,
                                middle_name, party,
                                occupation=occupation)
        legislator.add_source(leg_page_url)
        self.save_legislator(legislator)
def parse_legislator(self, chamber, year, full_name, district, url): with self.soup_context(url) as leg_page: name_str = leg_page.find("strong").contents[0].strip() if name_str.endswith("(D)"): party = "Democrat" elif name_str.endswith("(R)"): party = "Republican" elif name_str.endswith("(I)"): party = "Independent" else: party = "Other" full_name = full_name.replace("\n", "").replace(""", '"') full_name = full_name.replace("\t", "").replace("\r", "") (first_name, last_name, middle_name) = split_name(full_name) legislator = Legislator(year, chamber, district, full_name, first_name, last_name, middle_name, party) legislator.add_source(url) self.save_legislator(legislator)
def parse_legislator(self, chamber, year, full_name, district, url):
    with self.soup_context(url) as leg_page:
        name_str = leg_page.find('strong').contents[0].strip()

        if name_str.endswith('(D)'):
            party = 'Democrat'
        elif name_str.endswith('(R)'):
            party = 'Republican'
        elif name_str.endswith('(I)'):
            party = 'Independent'
        else:
            party = 'Other'

        # Clean up whitespace and HTML-encoded quotes
        full_name = full_name.replace('\n', '').replace('&quot;', '"')
        full_name = full_name.replace('\t', '').replace('\r', '')

        (first_name, last_name, middle_name) = split_name(full_name)

        legislator = Legislator(year, chamber, district, full_name,
                                first_name, last_name, middle_name,
                                party)
        legislator.add_source(url)
        self.add_legislator(legislator)
def scrape_senators(self, year):
    if year != '2009':
        return

    leg_page_url = "http://www.flsenate.gov/Legislators/"\
        "index.cfm?Mode=Member%20Pages&Submenu=1&Tab=legislators"
    leg_page = BeautifulSoup(self.urlopen(leg_page_url))

    th = leg_page.find('th', text='Legislator').parent
    table = th.parent.parent

    for row in table.findAll('tr')[1:]:
        # Replace non-breaking spaces with plain spaces
        full = row.td.a.contents[0].replace(u'\xa0', ' ')
        (last, first, middle) = self.split_name(full)

        district = row.findAll('td')[1].contents[0]
        party = row.findAll('td')[2].contents[0]

        leg = Legislator(year, 'upper', district, full,
                         first, last, middle, party)
        leg.add_source(leg_page_url)
        self.add_legislator(leg)
def scrape_old_legislators(self, chamber, session):
    """
    Scrape pre-2009 legislators.
    """
    if chamber == 'upper':
        chamber_name = 'Senate'
    else:
        chamber_name = 'House'

    if int(session) < 2008:
        filename = 'district.htm'
    else:
        filename = 'MembersDistrict.htm'

    leg_list_url = "http://legis.state.sd.us/sessions/%s/%s" % (
        session, filename)
    leg_list = self.soup_parser(self.urlopen(leg_list_url))

    for district_str in leg_list.findAll('h2'):
        district = district_str.contents[0].split(' ')[1].lstrip('0')

        for row in district_str.findNext('table').findAll('tr')[1:]:
            if row.findAll('td')[1].contents[0].strip() != chamber_name:
                continue

            full_name = row.td.a.contents[0].strip()
            first_name = full_name.split(', ')[1].split(' ')[0]
            last_name = full_name.split(',')[0]
            middle_name = ''

            party = row.findAll('td')[3].contents[0].strip()
            occupation = row.findAll('td')[4].contents[0].strip()

            legislator = Legislator(session, chamber, district,
                                    full_name, first_name, last_name,
                                    middle_name, party=party,
                                    occupation=occupation)
            legislator.add_source(leg_list_url)
            self.save_legislator(legislator)
def scrape_post_2003_legislators(self, chamber, year, session, suffix):
    url = 'http://leg.mt.gov/content/sessions/%d%s/%d%sMembers.txt' % (
        session, suffix, year,
        'Senate' if chamber == 'upper' else 'House')

    # Currently 2009 is different
    if year > 2008:
        csv_parser = csv.reader(self.urlopen(url).split(os.linesep),
                                delimiter='\t')
        # Discard the title row
        csv_parser.next()
    else:
        csv_parser = csv.reader(self.urlopen(url).split(os.linesep))

    for entry in csv_parser:
        if not entry:
            continue

        if year == 2003:
            first_name, last_name = entry[0].split(' ', 2)[1:3]
            party_letter = entry[1]
            district = entry[2]
        else:
            last_name = entry[0]
            first_name = entry[1]
            party_letter = entry[2]
            district = entry[3]

        if party_letter == '(R)':
            party = 'Republican'
        elif party_letter == '(D)':
            party = 'Democrat'
        else:
            party = party_letter

        first_name = first_name.capitalize()
        last_name = last_name.capitalize()

        # All we care about is the district number
        district = district.split('D ')[1]

        legislator = Legislator(session, chamber, district,
                                '%s %s' % (first_name, last_name),
                                first_name, last_name, '', party)
        legislator.add_source(url)
        self.add_legislator(legislator)
def scrape_legislators(self, chamber, year):
    # Pennsylvania doesn't make member lists easily available
    # for previous sessions, unfortunately
    if int(year) < 2009:
        #raise NoDataForYear(year)
        return

    session = "%s-%d" % (year, int(year) + 1)
    leg_list_url = legislators_url(chamber)

    with self.soup_context(leg_list_url) as member_list_page:
        for link in member_list_page.findAll(
                'a', href=re.compile(r'_bio\.cfm\?id=')):
            full_name = link.contents[0][0:-4]
            last_name = full_name.split(',')[0]
            first_name = full_name.split(' ')[1]

            if len(full_name.split(' ')) > 2:
                middle_name = full_name.split(' ')[2].strip(',')
            else:
                middle_name = ''

            party = link.contents[0][-2]
            if party == 'R':
                party = "Republican"
            elif party == 'D':
                party = "Democrat"

            district = re.search(
                r"District (\d+)", link.parent.contents[1]).group(1)

            legislator = Legislator(session, chamber, district,
                                    full_name, first_name, last_name,
                                    middle_name, party)
            legislator.add_source(leg_list_url)
            self.add_legislator(legislator)
def scrape_legislators(self, chamber, year):
    if year != '2009':
        raise NoDataForYear(year)

    session = "%d-%d" % (int(year), int(year) + 1)

    url = "http://www.ncga.state.nc.us/gascripts/members/"\
        "memberList.pl?sChamber="
    if chamber == 'lower':
        url += 'House'
    else:
        url += 'Senate'

    with self.urlopen_context(url) as leg_list_data:
        leg_list = self.soup_parser(leg_list_data)
        leg_table = leg_list.find('div', id='mainBody').find('table')

        for row in leg_table.findAll('tr')[1:]:
            party = row.td.contents[0].strip()
            if party == 'Dem':
                party = 'Democrat'
            elif party == 'Rep':
                party = 'Republican'

            district = row.findAll('td')[1].contents[0].strip()

            full_name = row.findAll('td')[2].a.contents[0].strip()
            full_name = full_name.replace(u'\u00a0', ' ')
            (first_name, last_name, middle_name, suffix) = split_name(
                full_name)

            legislator = Legislator(session, chamber, district,
                                    full_name, first_name, last_name,
                                    middle_name, party,
                                    suffix=suffix)
            legislator.add_source(url)
            self.save_legislator(legislator)
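# split_name isn't defined in this file. A minimal sketch of the
# four-tuple variant used in the North Carolina scraper above; the
# suffix handling here is an assumption for illustration, not the
# project's actual implementation:
def split_name(full_name):
    """Split 'First [Middle] Last[, Suffix]' into its parts."""
    suffix = ''
    if ',' in full_name:
        full_name, suffix = [p.strip() for p in full_name.split(',', 1)]
    parts = full_name.split()
    first_name = parts[0]
    last_name = parts[-1]
    middle_name = ' '.join(parts[1:-1])
    return (first_name, last_name, middle_name, suffix)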
def scrape_legislators(self, chamber, year): if year != "2009": raise NoDataForYear l1 = Legislator("2009-2010", chamber, "1st", "Bob Smith", "Bob", "Smith", "", "Democrat") if chamber == "upper": l1.add_role("President of the Senate", "2009-2010") else: l1.add_role("Speaker of the House", "2009-2010") l1.add_source("http://example.com/Bob_Smith.html") l2 = Legislator("2009-2010", chamber, "2nd", "Sally Johnson", "Sally", "Johnson", "", "Republican") l2.add_role("Minority Leader", "2009-2010") l2.add_source("http://example.com/Sally_Johnson.html") self.save_legislator(l1) self.save_legislator(l2)
def scrape_legislators(self, chamber, year): """ Scrape the ND legislators seated in a given chamber during a given year. """ # Error checking if year not in self.metadata['session_details']: raise NoDataForYear(year) # No legislator data for 1997 (though other data is available) if year == '1997': raise NoDataForYear(year) # URL building if chamber == 'upper': url_chamber_name = 'senate' norm_chamber_name = 'Senate' url_member_name = 'senators' else: url_chamber_name = 'house' norm_chamber_name = 'House' url_member_name = 'representatives' assembly_url = '/assembly/%i-%s/%s' % ( self.metadata['session_details'][str(year)]['number'], year, url_chamber_name) list_url = \ self.site_root + \ assembly_url + \ '/members/last-name.html' # Parsing soup = self.parser.parse(self.urlopen(list_url)) if not soup: raise ScrapeError('Failed to parse legaslative list page.') header = soup.find('h2') if not header: raise ScrapeError('Legaslative list header element not found.') party_images = {'/images/donkey.gif': 'Democrat', '/images/elephant.gif': 'Republican'} for row in header.findNextSibling('table').findAll('tr'): cells = row.findAll('td') party = party_images[cells[0].img['src']] name = map(lambda x: x.strip(), cells[1].a.contents[0].split(', ')) name.reverse() name = ' '.join(name) district = re.findall('District (\d+)', cells[2].contents[0])[0] attributes = { 'session': year, 'chamber': chamber, 'district': district, 'party': party, 'full_name': name, } split_name = name.split(' ') if len(split_name) > 2: attributes['first_name'] = split_name[0] attributes['middle_name'] = split_name[1].strip(' .') attributes['last_name'] = split_name[2] else: attributes['first_name'] = split_name[0] attributes['middle_name'] = u'' attributes['last_name'] = split_name[1] # we can get some more data.. bio_url = self.site_root + cells[1].a['href'] try: attributes.update(self.scrape_legislator_bio(bio_url)) except urllib2.HTTPError: self.log("failed to fetch %s" % bio_url) self.debug("attributes: %d", len(attributes)) self.debug(attributes) # Save legislator = Legislator(**attributes) legislator.add_source(bio_url) self.save_legislator(legislator)