def scrape_old_legislators(self, chamber, session):
    """
    Scrape pre-2009 legislators.
    """
    if chamber == "upper":
        chamber_name = "Senate"
    else:
        chamber_name = "House"

    if int(session) < 2008:
        filename = "district.htm"
    else:
        filename = "MembersDistrict.htm"

    leg_list_url = "http://legis.state.sd.us/sessions/%s/%s" % (
        session, filename)
    leg_list = self.soup_parser(self.urlopen(leg_list_url))

    for district_str in leg_list.findAll("h2"):
        district = district_str.contents[0].split(" ")[1].lstrip("0")

        for row in district_str.findNext("table").findAll("tr")[1:]:
            if row.findAll("td")[1].contents[0].strip() != chamber_name:
                continue

            full_name = row.td.a.contents[0].strip()
            party = row.findAll("td")[3].contents[0].strip()
            occupation = row.findAll("td")[4].contents[0].strip()

            legislator = Legislator(session, chamber, district,
                                    full_name, party=party,
                                    occupation=occupation)
            legislator.add_source(leg_list_url)
            self.save_legislator(legislator)

def scrape_new_legislators(self, chamber, session):
    """
    Scrape legislators from 2009 and later.
    """
    if chamber == "upper":
        search = "Senate Members"
    else:
        search = "House Members"

    leg_list_url = "http://legis.state.sd.us/sessions/%s/MemberMenu.aspx" % (
        session)
    leg_list = self.soup_parser(self.urlopen(leg_list_url))

    list_div = leg_list.find(text=search).findNext("div")

    for link in list_div.findAll("a"):
        full_name = link.contents[0].strip()

        leg_page_url = "http://legis.state.sd.us/sessions/%s/%s" % (
            session, link["href"])
        leg_page = self.soup_parser(self.urlopen(leg_page_url))

        party = leg_page.find(
            id="ctl00_contentMain_spanParty").contents[0].strip()

        district = leg_page.find(
            id="ctl00_contentMain_spanDistrict").contents[0]
        district = district.strip().lstrip("0")

        occ_span = leg_page.find(id="ctl00_contentMain_spanOccupation")
        if len(occ_span.contents) > 0:
            occupation = occ_span.contents[0].strip()
        else:
            occupation = None

        legislator = Legislator(session, chamber, district, full_name,
                                party=party, occupation=occupation)
        legislator.add_source(leg_page_url)
        self.save_legislator(legislator)

def scrape_senators(self, chamber, term):
    sen_url = 'http://www.ohiosenate.gov/directory.html'
    with self.urlopen(sen_url) as page:
        root = lxml.etree.fromstring(page, lxml.etree.HTMLParser())

        for el in root.xpath('//table[@class="fullWidth"]/tr/td'):
            sen_link = el.xpath('a[@class="senatorLN"]')[1]

            full_name = sen_link.text
            full_name = full_name[0:-2]
            if full_name == 'To Be Announced':
                continue

            district = el.xpath('string(h3)').split()[1]

            party = el.xpath('string(a[@class="senatorLN"]/span)')
            if party == "D":
                party = "Democrat"
            elif party == "R":
                party = "Republican"

            leg = Legislator(term, chamber, district, full_name,
                             '', '', '', party)
            leg.add_source(sen_url)
            self.save_legislator(leg)

def scrape_senate(self, term):
    url = 'http://www.senate.leg.state.mn.us/members/member_list.php'
    with self.urlopen(url) as html:
        doc = lxml.html.fromstring(html)

        for row in doc.xpath('//tr'):
            tds = row.xpath('td')
            if len(tds) == 5 and tds[1].text_content() in self._parties:
                district = tds[0].text_content()
                party = tds[1].text_content()
                name_a = tds[2].xpath('a')[0]
                name = name_a.text.strip()
                addr, phone = tds[3].text_content().split(u'\xa0\xa0')
                email = tds[4].text_content()

                leg = Legislator(term, 'upper', district, name,
                                 party=self._parties[party],
                                 office_address=addr,
                                 office_phone=phone)
                if '@' in email:
                    leg['email'] = email
                leg.add_source(url)
                self.save_legislator(leg)

def scrape(self, chamber, term):
    # this beautiful page is loaded from the council page via AJAX
    url = ('http://www.dccouncil.washington.dc.us/include/'
           'linkedpage.aspx?linkedpage=2&page=17')

    # do nothing if they're trying to get a lower chamber
    if chamber == 'lower':
        return

    with self.urlopen(url) as data:
        base_doc = lxml.html.fromstring(data)
        for link in base_doc.xpath('//a'):
            leg_url = ('http://www.dccouncil.washington.dc.us/'
                       + link.get('href'))
            with self.urlopen(leg_url) as leg_html:
                doc = lxml.html.fromstring(leg_html)

                name = link.text

                # Name, District
                title = doc.get_element_by_id('PageTitle')
                district = title.text.rsplit(', ')[-1]

                # party
                party = get_surrounding_block(doc, 'Political Affiliation')
                if 'Democratic' in party:
                    party = 'Democratic'
                else:
                    party = 'Independent'

                legislator = Legislator(term, 'upper', district, name,
                                        party=party)
                legislator.add_source(leg_url)
                self.save_legislator(legislator)

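# `get_surrounding_block` is used above but not defined in this section.
# A minimal sketch of the idea, assuming it returns the text of the
# block element nearest the given label; the traversal below is a guess,
# not the scraper's actual helper.
def get_surrounding_block(doc, label):
    # find an element whose text mentions the label, then return the
    # text content of its enclosing block
    matches = doc.xpath('//*[contains(text(), "%s")]' % label)
    if matches:
        return matches[0].getparent().text_content()
    return ''
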
def scrape_senators(self, chamber, year):
    sen_url = 'http://www.ohiosenate.gov/directory.html'
    with self.urlopen(sen_url) as page:
        root = lxml.etree.fromstring(page, lxml.etree.HTMLParser())

        for el in root.xpath('//table[@class="fullWidth"]/tr/td'):
            sen_link = el.xpath('a[@class="senatorLN"]')[1]

            full_name = sen_link.text
            full_name = full_name[0:len(full_name) - 2]

            district = el.xpath('string(h3)')
            district = district.split()[1]

            party = el.xpath('string(a[@class="senatorLN"]/span)')

            first_name = full_name.split()[0]
            last_name = full_name.split()[1]
            middle_name = ''

            leg = Legislator('128', chamber, district, full_name,
                             first_name, last_name, middle_name, party)
            leg.add_source(sen_url)
            self.save_legislator(leg)

def scrape_reps(self, chamber, session, term_name):
    # There are only 99 House districts
    for district in range(1, 100):
        rep_url = ('http://www.house.state.oh.us/components/'
                   'com_displaymembers/page.php?district='
                   + str(district))
        with self.urlopen(rep_url) as page:
            root = lxml.etree.fromstring(page, lxml.etree.HTMLParser())

            for el in root.xpath('//table[@class="page"]'):
                rep_link = el.xpath('tr/td/title')[0]
                full_name = rep_link.text
                party = full_name[-2]
                full_name = full_name[0:len(full_name) - 3]

                first_name = ""
                last_name = ""
                middle_name = ""

                if party == "D":
                    party = "Democrat"
                elif party == "R":
                    party = "Republican"
                # otherwise keep the raw party letter as-is

                leg = Legislator(term_name, chamber, str(district),
                                 full_name, first_name, last_name,
                                 middle_name, party)
                leg.add_source(rep_url)
                self.save_legislator(leg)

def scrape_legislator_data(self, url, chamber):
    with self.lxml_context(url) as page:
        legislator_table = page.get_element_by_id(
            "ctl00_PlaceHolderMain_dlMembers")
        legislators = legislator_table.cssselect('a')
        for legislator in legislators:
            name = legislator.text_content()
            full_name, first_name, middle_name, last_name = \
                self.separate_name(name)
            name_for_url = last_name.lower()
            name_for_url = re.sub("'", "", name_for_url)

            if chamber == 'upper':
                legislator_page_url = ("http://www.leg.wa.gov/senate/"
                                       "senators/Pages/"
                                       + name_for_url + ".aspx")
            else:
                legislator_page_url = ("http://www.leg.wa.gov/house/"
                                       "representatives/Pages/"
                                       + name_for_url + ".aspx")

            with self.lxml_context(legislator_page_url) as legislator_page:
                try:
                    full_name, first_name, middle_name, last_name = \
                        self.scrape_legislator_name(legislator_page)
                except:
                    # unexpected page layout; give up on this list
                    break

                party_element = legislator_page.get_element_by_id(
                    "ctl00_PlaceHolderMain_lblParty")
                if party_element.text_content() == '(R)':
                    party = 'Republican'
                else:
                    party = 'Democrat'

                district_element = legislator_page.get_element_by_id(
                    "ctl00_PlaceHolderMain_hlDistrict")
                district = district_element.text_content()

                legislator = Legislator('2009-2010', chamber, district,
                                        full_name, "", "", "", party)
                legislator.add_source(legislator_page_url)
                self.save_legislator(legislator)

def scrape_legislator_data(self, chamber, session):
    with self.urlopen(house_url(chamber)) as page_html:
        page = lxml.html.fromstring(page_html)
        legislator_table = page.get_element_by_id(
            "ctl00_PlaceHolderMain_dlMembers")
        legislators = legislator_table.cssselect('a')
        for legislator in legislators:
            name = legislator.text_content()
            full_name, first_name, middle_name, last_name = \
                separate_name(name)
            name_for_url = last_name.lower()
            name_for_url = re.sub("'", "", name_for_url)

            legislator_page_url = legs_url(chamber, name_for_url)
            with self.urlopen(legislator_page_url) as legislator_page_html:
                legislator_page = lxml.html.fromstring(
                    legislator_page_html)
                try:
                    full_name, first_name, middle_name, last_name = \
                        self.scrape_legislator_name(legislator_page)
                except:
                    # unexpected page layout; give up on this list
                    break

                party_element = legislator_page.get_element_by_id(
                    "ctl00_PlaceHolderMain_lblParty")
                if party_element.text_content() == '(R)':
                    party = 'Republican'
                else:
                    party = 'Democrat'

                district_element = legislator_page.get_element_by_id(
                    "ctl00_PlaceHolderMain_hlDistrict")
                district = district_element.text_content()

                legislator = Legislator(session, chamber, district,
                                        full_name, "", "", "", party)
                legislator.add_source(legislator_page_url)
                self.save_legislator(legislator)

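# `house_url`, `legs_url`, and `separate_name` are module-level helpers
# this version of the scraper factored out. The sketches below are
# inferred from the inline URLs in the earlier version above; the
# member-list URLs returned by `house_url` are assumptions.
def house_url(chamber):
    # hypothetical member-list URL for the given chamber
    if chamber == 'upper':
        return 'http://www.leg.wa.gov/senate/senators/Pages/default.aspx'
    return 'http://www.leg.wa.gov/house/representatives/Pages/default.aspx'


def legs_url(chamber, name_for_url):
    # per-member page, matching the URLs built inline in the earlier version
    if chamber == 'upper':
        return ('http://www.leg.wa.gov/senate/senators/Pages/'
                + name_for_url + '.aspx')
    return ('http://www.leg.wa.gov/house/representatives/Pages/'
            + name_for_url + '.aspx')


def separate_name(name):
    # hypothetical: split "First [Middle] Last" into
    # (full, first, middle, last), matching the unpacking at the call site
    parts = name.split()
    if len(parts) == 2:
        first, middle, last = parts[0], '', parts[1]
    else:
        first, middle, last = parts[0], ' '.join(parts[1:-1]), parts[-1]
    return name, first, middle, last
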
def scrape_reps(self, chamber, session):
    rep_url = 'http://www.maine.gov/legis/house/dist_mem.htm'
    with self.urlopen(rep_url) as page:
        root = lxml.etree.fromstring(page, lxml.etree.HTMLParser())

        # There are 151 districts
        for district in range(1, 152):
            if (district % 10) == 0:
                path = 'string(/html/body/p[%s]/a[3])' % (district + 4)
            else:
                path = 'string(/html/body/p[%s]/a[2])' % (district + 4)

            name = root.xpath(path)
            if len(name) > 0:
                if name.split()[0] != 'District':
                    mark = name.find('(')
                    party = name[mark + 1]
                    name = name[15:mark]

                    firstname = ""
                    lastname = ""
                    middlename = ""

                    if party == "V":
                        name = "Vacant"

                    leg = Legislator(session, chamber, district, name,
                                     firstname, lastname, middlename,
                                     party)
                    leg.add_source(rep_url)
                    self.save_legislator(leg)

def scrape_details(self, chamber, term, leg_name, leg_link, role):
    try:
        url = 'http://billstatus.ls.state.ms.us/members/%s' % leg_link
        with self.urlopen(url) as details_page:
            details_page = details_page.decode('latin1').encode(
                'utf8', 'ignore')
            root = lxml.etree.fromstring(details_page,
                                         lxml.etree.HTMLParser())
            party = root.xpath('string(//party)')
            district = root.xpath('string(//district)')

            first_name, middle_name, last_name = "", "", ""

            home_phone = root.xpath('string(//h_phone)')
            bis_phone = root.xpath('string(//b_phone)')
            capital_phone = root.xpath('string(//cap_phone)')
            other_phone = root.xpath('string(//oth_phone)')
            org_info = root.xpath('string(//org_info)')
            email_name = root.xpath('string(//email_address)')
            email = '%s@%s.ms.gov' % (email_name, chamber)

            if party == 'D':
                party = 'Democratic'
            else:
                party = 'Republican'

            leg = Legislator(term, chamber, district, leg_name,
                             first_name, last_name, middle_name, party,
                             role=role, home_phone=home_phone,
                             bis_phone=bis_phone,
                             capital_phone=capital_phone,
                             other_phone=other_phone,
                             org_info=org_info, email=email)
            leg.add_source(url)
            self.save_legislator(leg)
    except scrapelib.HTTPError, e:
        self.warning(str(e))

def scrape(self, chamber, year):
    session = "%d-%d" % (int(year), int(year) + 1)

    url = "http://www.ncga.state.nc.us/gascripts/members/"\
          "memberList.pl?sChamber="

    if chamber == 'lower':
        url += 'House'
    else:
        url += 'Senate'

    with self.urlopen(url) as (resp, data):
        leg_list = self.soup_parser(data)
        leg_table = leg_list.find('div', id='mainBody').find('table')

        for row in leg_table.findAll('tr')[1:]:
            party = row.td.contents[0].strip()
            if party == 'Dem':
                party = 'Democrat'
            elif party == 'Rep':
                party = 'Republican'

            district = row.findAll('td')[1].contents[0].strip()

            full_name = row.findAll('td')[2].a.contents[0].strip()
            full_name = full_name.replace(u'\u00a0', ' ')
            (first_name, last_name, middle_name, suffix) = split_name(
                full_name)

            legislator = Legislator(session, chamber, district,
                                    full_name, first_name, last_name,
                                    middle_name, party, suffix=suffix)
            legislator.add_source(url)
            self.save_legislator(legislator)

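# `split_name` is not shown in this section. A rough sketch matching the
# (first, last, middle, suffix) tuple the call site unpacks; the suffix
# list and the handling of multi-word last names are assumptions.
def split_name(full_name):
    suffixes = ('Jr.', 'Sr.', 'II', 'III', 'IV')
    parts = full_name.split()
    suffix = ''
    if parts[-1] in suffixes:
        suffix = parts.pop()
    first = parts[0]
    last = parts[-1]
    middle = ' '.join(parts[1:-1])
    return first, last, middle, suffix
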
def scrape_reps(self, year):
    if year != '2009':
        return

    leg_page_url = "http://www.flhouse.gov/Sections/Representatives/"\
                   "representatives.aspx"
    leg_page = BeautifulSoup(self.urlopen(leg_page_url))

    table = leg_page.find('table',
                          id='ctl00_ContentPlaceHolder1_ctrlContentBox'
                             '_ctrlPageContent_ctl00_dgLegislators')

    for row in table.findAll('tr')[1:]:
        full = row.findAll('td')[1].a.contents[0].replace(u'\xa0', ' ')
        district = row.findAll('td')[3].contents[0]
        party = row.findAll('td')[2].contents[0]

        if party == 'D':
            party = 'Democrat'
        elif party == 'R':
            party = 'Republican'

        leg = Legislator(year, 'lower', district, full, party=party)
        leg.add_source(leg_page_url)
        self.save_legislator(leg)

def scrape_rep(self, name, term, url):
    # special case names that confuse name_tools
    if name == 'Franklin, A.B.':
        name = 'Franklin, A. B.'
    elif ', Jr., ' in name:
        # strip the suffix out of the middle of the name, then
        # reattach it at the end
        name = name.replace(', Jr., ', ' ')
        name += ', Jr.'
    elif ', III, ' in name:
        name = name.replace(', III, ', ' ')
        name += ', III'

    with self.urlopen(url) as text:
        page = lxml.html.fromstring(text)

        district = page.xpath(
            "//a[contains(@href, 'Maps')]")[0].attrib['href']
        district = re.search(r"district(\d+)\.pdf", district).group(1)

        if "Democrat District" in text:
            party = "Democratic"
        elif "Republican District" in text:
            party = "Republican"
        elif "Independent District" in text:
            party = "Independent"
        else:
            party = "Other"

        leg = Legislator(term, 'lower', district, name, party=party)
        leg.add_source(url)
        self.save_legislator(leg)

def scrape(self, chamber, year):
    year = int(year)
    session = internal_sessions[year][0][1]

    # iterating through subsessions would be a better way to do this..
    if year % 2 == 0 and (year != dt.date.today().year or
                          year + 1 != dt.date.today().year):
        raise NoDataForYear(year)

    if chamber == 'upper':
        url = "http://legis.wi.gov/w3asp/contact/legislatorslist.aspx?house=senate"
    else:
        url = "http://legis.wi.gov/w3asp/contact/legislatorslist.aspx?house=assembly"

    #body = unicode(self.urlopen(url), 'latin-1')
    with self.urlopen(url) as body:
        page = lxml.html.fromstring(body)

        for row in page.cssselect("#ctl00_C_dgLegData tr"):
            if len(row.cssselect("td a")) > 0:
                rep_url = list(row)[0].cssselect("a[href]")[0].get("href")

                legpart = re.findall(r'([\w\-\,\s\.]+)\s+\(([\w])\)',
                                     list(row)[0].text_content())
                if legpart:
                    full_name, party = legpart[0]
                    district = str(int(list(row)[2].text_content()))

                    leg = Legislator(session, chamber, district,
                                     full_name, party)
                    leg.add_source(rep_url)
                    leg = self.add_committees(leg, rep_url, session)
                    self.save_legislator(leg)

def scrape(self, chamber, term):
    self.validate_term(term)

    if chamber == 'lower':
        title = 'Representative'
    else:
        title = 'Senator'

    url = 'http://www.le.state.ut.us/asp/roster/roster.asp?year=%s' % term
    leg_list = self.soup_parser(self.urlopen(url))

    for row in leg_list.findAll('table')[1].findAll('tr')[1:]:
        tds = row.findAll('td')

        leg_title = tds[1].find(text=True)
        if leg_title == title:
            fullname = tds[0].find(text=True)
            last_name = fullname.split(',')[0]
            first_name = fullname.split(' ')[1]
            # middle name may be absent; default to empty string
            middle_name = ''
            if len(fullname.split(' ')) > 2:
                middle_name = fullname.split(' ')[2]

            leg = Legislator(term, chamber, tds[3].find(text=True),
                             fullname, first_name, last_name,
                             middle_name, tds[2].find(text=True))
            leg.add_source(url)
            self.save_legislator(leg)

def scrape_legislator_data(self, url, chamber):
    party_fulls = {'R': 'Republican', 'D': 'Democrat'}
    with self.urlopen(url) as page:
        page = BeautifulSoup(page)

        for data in page.find('table',
                              id='ctl00_mainCopy_DataList1')('td'):
            spans = data('span')
            if len(spans) == 0:
                self.debug('Found an empty cell in %s. Continuing' % url)
                continue

            full_name = ' '.join([span.string.strip() for span in spans])
            if len(spans[0].string.strip().split()) == 2:
                first_name, middle_name = spans[0].string.strip().split()
            else:
                first_name, middle_name = spans[0].string.strip(), ''
            last_name = spans[1].string.strip()

            details_url = get_abs_url(url, data.find('a')['href'])
            with self.urlopen(details_url) as details:
                details = BeautifulSoup(details)

                district = details.find(
                    'a', id='ctl00_mainCopy_LegisInfo_DISTRICTLabel'
                ).string.strip()
                party = party_fulls[details.find(
                    'span', id='ctl00_mainCopy_LegisInfo_PARTYLabel'
                ).string]

                leg = Legislator('2010', chamber, district, full_name,
                                 first_name, last_name, middle_name,
                                 party)
                leg.add_source(details_url)

                comms_table = details.find(
                    'table', id='ctl00_mainCopy_MembershipGrid')
                for comms_raw_data in comms_table('tr')[1:]:
                    comm_data = comms_raw_data('td')
                    comm_role_type = comm_data[0].string.strip()
                    comm_name = comm_data[1]('a')[0].string.strip()
                    leg.add_role(comm_role_type, '2010',
                                 chamber=chamber, committee=comm_name)

                self.save_legislator(leg)

def scrape(self, chamber, term):
    urls = {
        'upper': ('http://www.legislature.state.al.us/senate/'
                  'senators/senateroster_alpha.html'),
        'lower': ('http://www.legislature.state.al.us/house/'
                  'representatives/houseroster_alpha.html'),
    }
    url = urls[chamber]

    with self.urlopen(url) as html:
        doc = lxml.html.fromstring(html)
        rows = doc.xpath('//strong[starts-with(text(), "MEMBERS")]'
                         '/following-sibling::table/tr')
        for row in rows[1:]:
            name, party, district, office, phone = row.getchildren()

            # if the name column contains a link it isn't vacant
            link = name.xpath('a')
            if link:
                name = name.text_content()
                name = ' '.join(normalize_name(name))
                party = party.text_content()
                district = district.text_content()
                office = office.text_content()
                phone = phone.text_content()

                leg = Legislator(term, chamber, district, name, party,
                                 phone=phone, office=office,
                                 url=link[0].get('href'))
                leg.add_source(url)
                self.save_legislator(leg)

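# `normalize_name` is defined elsewhere; the call site joins its result
# with spaces, so it presumably returns the cleaned name pieces. A
# hedged sketch; the cleanup rule below is an assumption.
import re

def normalize_name(name):
    # collapse runs of whitespace (roster cells are full of &nbsp;)
    name = re.sub(r'\s+', ' ', name).strip()
    return name.split()
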
def scrape(self, chamber, term):
    url = self.urls[term][chamber]
    if url is None:
        raise NoDataForPeriod(term)

    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)

        for row in page.xpath("//tr")[1:]:
            name = row.xpath("td")[0].text_content()
            name = name.split(",")
            if len(name) == 2:
                fullname = "%s %s" % (name[1].strip(), name[0].strip())
            elif len(name) == 3:
                fullname = "%s %s, %s" % (name[1].strip(),
                                          name[0].strip(),
                                          name[2].strip())
            else:
                fullname = " ".join(name).strip()

            # The most recent general assembly's legislator list is
            # formatted slightly differently from the archived versions
            if term == "106th General Assembly":
                party = row.xpath("td")[1].text_content().strip()
                district = row.xpath("td")[3].text_content().replace(
                    "District ", "").strip()
            else:
                party, district = row.xpath(
                    "td")[1].text_content().split("-")
                party = party.strip()
                district = district.strip()

            leg = Legislator(term, chamber, district, fullname,
                             party=party)
            leg.add_source(url)
            self.save_legislator(leg)

def scrape(self, chamber, year):
    found = False
    for session in metadata['sessions']:
        if session['name'] == year:
            found = True
            break
    if not found:
        raise NoDataForYear(year)

    if chamber == 'lower':
        title = 'Representative'
    else:
        title = 'Senator'

    url = 'http://www.le.state.ut.us/asp/roster/roster.asp?year=%s' % year
    leg_list = self.soup_parser(self.urlopen(url))

    for row in leg_list.findAll('table')[1].findAll('tr')[1:]:
        tds = row.findAll('td')

        leg_title = tds[1].find(text=True)
        if leg_title == title:
            fullname = tds[0].find(text=True)
            last_name = fullname.split(',')[0]
            first_name = fullname.split(' ')[1]
            # middle name may be absent; default to empty string
            middle_name = ''
            if len(fullname.split(' ')) > 2:
                middle_name = fullname.split(' ')[2]

            leg = Legislator(year, chamber, tds[3].find(text=True),
                             fullname, first_name, last_name,
                             middle_name, tds[2].find(text=True))
            leg.add_source(url)
            self.save_legislator(leg)

def fetch_member(self, url, name, term, chamber):
    party_map = {'R': 'Republican', 'D': 'Democratic',
                 'I': 'Independent'}
    party_district_re = re.compile(
        r'\((R|D|I)\) - (?:House|Senate) District\s+(\d+)')

    url = 'http://leg6.state.va.us' + url

    # handle resignations, special elections
    match = re.search(r'-(Resigned|Member) (\d{1,2}/\d{1,2})?', name)
    if match:
        action, date = match.groups()
        name = name.rsplit('-')[0]
        if action == 'Resigned':
            pass  # TODO: set end date
        elif action == 'Member':
            pass  # TODO: set start date

    with self.urlopen(url) as html:
        doc = lxml.html.fromstring(html)

        party_district_line = doc.xpath('//h3/font/text()')[0]
        party, district = party_district_re.match(
            party_district_line).groups()

        leg = Legislator(term, chamber, district, name.strip(),
                         party=party_map[party])
        leg.add_source(url)
        self.save_legislator(leg)

def scrape(self, chamber, term):
    self.validate_term(term)

    if chamber == 'upper':
        url = "http://legis.wi.gov/w3asp/contact/legislatorslist.aspx?house=senate"
    else:
        url = "http://legis.wi.gov/w3asp/contact/legislatorslist.aspx?house=assembly"

    with self.urlopen(url) as body:
        page = lxml.html.fromstring(body)

        for row in page.cssselect("#ctl00_C_dgLegData tr"):
            if len(row.cssselect("td a")) > 0:
                rep_url = list(row)[0].cssselect("a[href]")[0].get("href")

                legpart = re.findall(r'([\w\-\,\s\.]+)\s+\(([\w])\)',
                                     list(row)[0].text_content())
                if legpart:
                    full_name, party = legpart[0]
                    district = str(int(list(row)[2].text_content()))

                    leg = Legislator(term, chamber, district,
                                     full_name, party=party)
                    leg.add_source(rep_url)
                    leg = self.add_committees(leg, rep_url, term)
                    self.save_legislator(leg)

def scrape(self, chamber, term):
    if term != '2011-2012':
        raise NoDataForPeriod(term)

    chamber_name = {'upper': 'Senate', 'lower': 'House'}[chamber]

    url = ("http://www.in.gov/cgi-bin/legislative/listing/"
           "listing-2.pl?data=alpha&chamber=%s" % chamber_name)

    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)

        for link in page.xpath("//div[@id='col2']/p/a"):
            name = link.text.strip()

            details = link.getnext().text.strip()

            party = details.split(',')[0]
            if party == 'Democrat':
                party = 'Democratic'

            district = re.search(r'District (\d+)', details).group(1)
            district = district.lstrip('0')

            leg = Legislator(term, chamber, district, name,
                             '', '', '', party)
            leg.add_source(url)
            self.save_legislator(leg)

def scrape_reps(self, chamber, term):
    # There are 99 House districts
    for district in xrange(1, 100):
        rep_url = ('http://www.house.state.oh.us/components/'
                   'com_displaymembers/page.php?district=%d' % district)
        with self.urlopen(rep_url) as page:
            root = lxml.etree.fromstring(page, lxml.etree.HTMLParser())

            for el in root.xpath('//table[@class="page"]'):
                rep_link = el.xpath('tr/td/title')[0]
                full_name = rep_link.text
                party = full_name[-2]
                full_name = full_name[0:-3]

                if party == "D":
                    party = "Democratic"
                elif party == "R":
                    party = "Republican"

                leg = Legislator(term, chamber, str(district),
                                 full_name, '', '', '', party)
                leg.add_source(rep_url)
                self.save_legislator(leg)

def scrape_house(self, term):
    url = 'http://www.house.leg.state.mn.us/members/housemembers.asp'
    office_addr = ''' State Office Building,
100 Rev. Dr. Martin Luther King Jr. Blvd.
Saint Paul, Minnesota 55155'''

    with self.urlopen(url) as html:
        doc = lxml.html.fromstring(html)

        # skip first header row
        for row in doc.xpath('//tr')[1:]:
            tds = [td.text_content().strip() for td in row.xpath('td')]
            if len(tds) == 5:
                district = tds[0]
                name, party = tds[1].rsplit(' ', 1)
                if party == '(R)':
                    party = 'Republican'
                elif party == '(DFL)':
                    party = 'Democratic-Farmer-Labor'
                addr = tds[2] + office_addr
                phone = tds[3]
                email = tds[4]

                leg = Legislator(term, 'lower', district, name,
                                 party=party, office_address=addr,
                                 office_phone=phone, email=email)
                leg.add_source(url)
                self.save_legislator(leg)

def scrape(self, chamber, term):
    # Pennsylvania doesn't make member lists easily available
    # for previous sessions, unfortunately
    if term != '2009-2010':
        raise NoDataForPeriod(term)

    leg_list_url = legislators_url(chamber)

    with self.urlopen(leg_list_url) as page:
        page = lxml.html.fromstring(page)

        for link in page.xpath("//a[contains(@href, '_bio.cfm')]"):
            full_name = link.text[0:-4]
            district = re.search(r"District (\d+)", link.tail).group(1)

            party = link.text[-2]
            if party == 'R':
                party = 'Republican'
            elif party == 'D':
                party = 'Democratic'

            legislator = Legislator(term, chamber, district,
                                    full_name, party=party)
            legislator.add_source(leg_list_url)
            self.save_legislator(legislator)

def scrape_pre_58_legislators(self, chamber, term, suffix):
    url = 'http://leg.mt.gov/css/Sessions/%s%s/legname.asp' % (
        term, suffix)
    legislator_page = ElementTree(lxml.html.fromstring(
        self.urlopen(url)))

    if term == '57':
        if chamber == 'upper':
            tableName = '57th Legislatore Roster Senate (2001-2002)'
            startRow = 3
        else:
            tableName = '57th Legislator Roster (House)(2001-2002)'
            startRow = 5
    elif term == '56':
        if chamber == 'upper':
            tableName = 'Members of the Senate'
            startRow = 3
        else:
            tableName = 'Members of the House'
            startRow = 5

    for table in legislator_page.xpath("//table"):
        if 'name' in table.attrib and table.attrib['name'] == tableName:
            parse_names = False
            for row in table.getchildren():
                if row.tag != 'tr':
                    continue

                celldata = row.getchildren()[0].text_content().strip()
                if parse_names and len(celldata) != 0:
                    name, party_letter = celldata.rsplit(' (', 1)
                    party_letter = party_letter[0]

                    nameParts = [namePart.strip()
                                 for namePart in name.split(',')]
                    assert len(nameParts) < 4
                    if len(nameParts) == 2:
                        last_name, first_name = nameParts
                    elif len(nameParts) == 3:
                        last_name = ' '.join(nameParts[0:2])
                        first_name = nameParts[2]
                    else:
                        # single-part name: treat it all as a last name
                        last_name, first_name = nameParts[0], ''

                    district = row.getchildren()[2].text_content().strip()

                    if party_letter == 'R':
                        party = 'Republican'
                    elif party_letter == 'D':
                        party = 'Democrat'
                    else:
                        party = party_letter

                    legislator = Legislator(
                        term, chamber, district,
                        ('%s %s' % (first_name, last_name)).strip(),
                        first_name, last_name, '', party)
                    legislator.add_source(url)
                    self.save_legislator(legislator)

                if celldata == "Name (Party)":
                    # The table headers seem to vary in size, but the
                    # last header row always seems to start with
                    # 'Name (Party)' -- once we find that, start
                    # parsing legislator names
                    parse_names = True

def scrape(self, chamber, term):
    mtype = {'upper': 'senator', 'lower': 'representative'}[chamber]

    extra_fields = {
        'phone': './phone-numbers/phone-number[@title="Capitol Phone"]/@number',
        'district_phone': './phone-numbers/phone-number[@title="District Phone"]/@number',
    }
    addr_fields = {
        'capitol_address': './addresses/address[@title="Capitol Address"]',
        'district_address': './addresses/address[@title="District Office Address"]',
    }
    party_map = {'DEM': 'Democratic', 'REP': 'Republican'}

    with self.urlopen('http://www.leg.state.or.us/xml/members.xml') as html:
        doc = lxml.html.fromstring(html)

        for member in doc.xpath('//member[@member-type="%s"]' % mtype):
            first_name = member.get('first-name')
            last_name = member.get('last-name')
            party = party_map[member.get('party')]

            # extra_fields
            extra_dict = {}
            for name, xpath in extra_fields.iteritems():
                result = member.xpath(xpath)
                if result:
                    extra_dict[name] = result[0]

            # address fields: the street-address/city/state/postal-code
            # attributes live on the matched <address> element
            for name, xpath in addr_fields.iteritems():
                result = member.xpath(xpath)
                if result:
                    addr = result[0]
                    extra_dict[name] = '%s %s, %s %s' % (
                        addr.get('street-address'), addr.get('city'),
                        addr.get('state'), addr.get('postal-code'))

            leg = Legislator(term, chamber,
                             member.get('district-number'),
                             full_name=first_name + ' ' + last_name,
                             first_name=first_name,
                             last_name=last_name,
                             middle_name=member.get('middle-initial'),
                             party=party,
                             email=member.get('e-mail'),
                             website=member.get('website'),
                             oregon_member_id=member.get('leg-member-id'),
                             **extra_dict)
            leg.add_source('http://www.leg.state.or.us/xml/members.xml')
            self.save_legislator(leg)

def scrape(self, chamber, year):
    if year != '2009':
        raise NoDataForYear(year)

    term = "%s-%d" % (year, int(year) + 1)

    # What Vermont claims are Word and Excel files are actually
    # just HTML tables.
    # What Vermont claims is a CSV file is actually one row of comma
    # separated values followed by a ColdFusion error.
    leg_url = "http://www.leg.state.vt.us/legdir/"\
              "memberdata.cfm/memberdata.doc?FileType=W"
    leg_table = BeautifulSoup(self.urlopen(leg_url))

    for tr in leg_table.findAll('tr')[1:]:
        leg_cham = tr.findAll('td')[3].contents[0]
        if leg_cham == 'H' and chamber == 'upper':
            continue
        if leg_cham == 'S' and chamber == 'lower':
            continue

        district = tr.findAll('td')[5].contents[0]
        district = district.replace(' District', '').strip()

        first = tr.findAll('td')[6].contents[0]

        middle = tr.findAll('td')[7]
        if len(middle.contents) == 0:
            middle = ''
        else:
            middle = middle.contents[0].strip()

        last = tr.findAll('td')[8].contents[0]

        if len(middle) == 0:
            full = "%s, %s" % (last, first)
        else:
            full = "%s, %s %s." % (last, first, middle)

        official_email = tr.findAll('td')[9]
        if len(official_email.contents) == 0:
            official_email = ''
        else:
            official_email = official_email.contents[0]

        party = tr.findAll('td')[4].contents[0]
        if party == 'D':
            party = 'Democrat'
        elif party == 'R':
            party = 'Republican'
        elif party == 'I':
            party = 'Independent'
        elif party == 'P':
            party = 'Progressive'

        leg = Legislator(term, chamber, district, full,
                         first, last, middle, party,
                         official_email=official_email)
        leg.add_source(leg_url)
        self.save_legislator(leg)

def scrape(self, chamber, year):
    if int(year) != 2009:
        return

    session = "%s-%d" % (year, int(year) + 1)

    # What Vermont claims are Word and Excel files are actually
    # just HTML tables.
    # What Vermont claims is a CSV file is actually one row of comma
    # separated values followed by a ColdFusion error.
    leg_url = ("http://www.leg.state.vt.us/legdir/"
               "memberdata.cfm/memberdata.doc?FileType=W")
    leg_table = BeautifulSoup(self.urlopen(leg_url))

    for tr in leg_table.findAll("tr")[1:]:
        leg_cham = tr.findAll("td")[3].contents[0]
        if leg_cham == "H" and chamber == "upper":
            continue
        if leg_cham == "S" and chamber == "lower":
            continue

        district = tr.findAll("td")[5].contents[0]
        district = district.replace(" District", "").strip()

        first = tr.findAll("td")[6].contents[0]

        middle = tr.findAll("td")[7]
        if len(middle.contents) == 0:
            middle = ""
        else:
            middle = middle.contents[0].strip()

        last = tr.findAll("td")[8].contents[0]

        if len(middle) == 0:
            full = "%s, %s" % (last, first)
        else:
            full = "%s, %s %s." % (last, first, middle)

        official_email = tr.findAll("td")[9]
        if len(official_email.contents) == 0:
            official_email = ""
        else:
            official_email = official_email.contents[0]

        party = tr.findAll("td")[4].contents[0]
        if party == "D":
            party = "Democrat"
        elif party == "R":
            party = "Republican"
        elif party == "I":
            party = "Independent"
        elif party == "P":
            party = "Progressive"

        leg = Legislator(session, chamber, district, full,
                         first, last, middle, party,
                         official_email=official_email)
        leg.add_source(leg_url)
        self.save_legislator(leg)

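# All of the scrape methods above assume the surrounding scraper
# framework: a base class providing urlopen() (usable as a context
# manager; one older scraper above unpacks a (response, body) pair),
# soup_parser()/lxml_context(), save_legislator(), and a Legislator
# record taking (term_or_session, chamber, district, full_name, ...)
# plus arbitrary keyword fields. A minimal sketch of that contract for
# reading the code in isolation; the names match the call sites, but
# the bodies here are assumptions, not the real framework.
import contextlib
import urllib2


class ScraperSketch(object):
    @contextlib.contextmanager
    def urlopen(self, url):
        # the real framework layers caching, retries, and error
        # handling on top of this (assumption)
        yield urllib2.urlopen(url).read()

    def save_legislator(self, legislator):
        # the real framework validates the record and writes it out
        # (assumption); here we just show the contract
        print legislator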