def scrape_member(self, chamber, year, member_url):
    member_page = self.urlopen(member_url)
    doc = lxml.html.fromstring(member_page)

    photo_url = doc.xpath('//div[@id="bioImage"]/img/@src')[0]
    name_pieces = doc.xpath('//span[@id="name"]/text()')[0].split()
    full_name = ' '.join(name_pieces[1:-1]).strip()

    party = name_pieces[-1]
    if party == '(R)':
        party = 'Republican'
    elif party == '(D)':
        party = 'Democratic'
    elif party == '(I)':
        party = 'Independent'

    district = doc.xpath('//span[@id="districtHeader"]/text()')[0].split()[-1]

    leg = Legislator(year, chamber, district, full_name, party=party,
                     photo_url=photo_url, url=member_url)
    leg.add_source(member_url)

    address = '\n'.join(doc.xpath('//div[@id="FrankfortAddresses"]//span[@class="bioText"]/text()'))

    phone = None
    phone_numbers = doc.xpath('//div[@id="PhoneNumbers"]//span[@class="bioText"]/text()')
    for num in phone_numbers:
        if num.startswith('Annex: '):
            phone = num.replace('Annex: ', '')

    leg.add_office('capitol', 'Capitol Office', address=address, phone=phone)

    self.save_legislator(leg)
def scrape_senators(self, chamber, term):
    url = 'http://www.ohiosenate.gov/directory.html'
    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        for el in page.xpath('//table[@class="fullWidth"]/tr/td'):
            sen_link = el.xpath('a[@class="senatorLN"]')[1]
            sen_url = sen_link.get('href')

            full_name = sen_link.text
            full_name = full_name[0:-2]
            if full_name == 'To Be Announced':
                continue

            district = el.xpath('string(h3)').split()[1]

            party = el.xpath('string(a[@class="senatorLN"]/span)')
            if party == "D":
                party = "Democratic"
            elif party == "R":
                party = "Republican"

            office_phone = el.xpath("b[text() = 'Phone']")[0].tail
            office_phone = office_phone.strip(' :')

            office = ", ".join([x.strip() for x in el.xpath("./text()")[2:-1]])

            photo_url = el.xpath("a/img")[0].attrib['src']

            email = el.xpath('.//span[@class="tan"]/text()')[1]

            # pass along the scraped address (it was previously dropped in
            # favor of an empty string)
            leg = Legislator(term, chamber, district, full_name,
                             party=party, photo_url=photo_url, url=sen_url,
                             email=email)

            committees = self.scrape_senate_committees(sen_url)

            leg.add_office('capitol', 'Capitol Office', address=office,
                           phone=office_phone)

            leg.add_source(url)
            leg.add_source(sen_url)

            for committee in committees:
                chmbr = chamber
                if "joint" in committee['committee'].lower():
                    chmbr = "joint"
                leg.add_role('committee member', term=term, chamber=chmbr,
                             committee=committee['committee'],
                             position=committee['title'])

            self.save_legislator(leg)
def scrape_rep_info(self, url, term):
    district_to_sponsor_id = self.get_sponsor_ids()

    # get reps
    html = self.get(url).text
    page = lxml.html.fromstring(html)
    reps = page.xpath("//table[contains(@id,'HseMainContent_tabByName_TabPanel')]//tr")
    for rep in reps:
        # get basic rep info
        info = rep.xpath(".//td")
        if len(info) == 0:
            continue
        rep_name, party, district, suite, phone = [i.text_content() for i in info]
        district = district.replace("House District", "").strip()
        office_address = '{}\n11 S. Union Street\nMontgomery, AL 36130'.format(suite)

        assert rep_name.count(",") == 1, \
            "Unable to parse representative's name: {}".format(rep_name)
        full_name_parts = [x.strip() for x in rep_name.split(",")]
        full_name = "{0} {1}".format(full_name_parts[1], full_name_parts[0])

        PARTIES = {'R': "Republican", 'D': "Democratic"}
        party = PARTIES[party.strip()]

        # add basic leg info and main office
        leg = Legislator(term, "lower", district, full_name, party=party)
        leg.add_office('capitol', 'Capitol Office',
                       address=office_address, phone=phone.strip())

        # match rep to sponsor_id if possible
        ln, fn = rep_name.split(",")
        last_fi_key = "{ln} ({fi})".format(ln=ln.strip(), fi=fn.strip()[0])

        leg.add_source(url)

        try:
            sponsor_id = district_to_sponsor_id[district]
        except KeyError:
            # can't find rep's sponsor_id, do what we can and get out!
            self.logger.warning("Legislator {name} does not match any sponsor_id "
                                "and thus will not be linked to bills or "
                                "committees".format(name=rep_name))
            self.save_legislator(leg)
            continue

        # scrape rep's additional info from sponsor page
        rep_sponsor_url = ("http://www.legislature.state.al.us/aliswww/"
                           "Representative.aspx?OID_SPONSOR={}").format(sponsor_id)
        rep_html = self.get(rep_sponsor_url).text
        rep_page = lxml.html.fromstring(rep_html)
        leg["photo_url"] = rep_page.xpath("//input[contains(@id,'imgLEG')]/@src")[0]
        self.add_committees(rep_page, leg, "lower", term)
        leg.add_source(rep_sponsor_url)

        self.save_legislator(leg)
def get_member(self, term, chamber, kpid):
    url = '%smembers/%s' % (ksapi.url, kpid)
    content = json.loads(self.get(url).text)['content']

    party = content['PARTY']
    if party == 'Democrat':
        party = 'Democratic'

    slug = {'2013-2014': 'b2013_14',
            '2015-2016': 'b2015_16'}[term]
    leg_url = 'http://www.kslegislature.org/li/%s/members/%s/' % (slug, kpid)

    legislator_page = self.lxmlize(leg_url)
    (photo_url, ) = legislator_page.xpath(
        '//img[@class="profile-picture"]/@src')

    legislator = Legislator(term, chamber, str(content['DISTRICT']),
                            content['FULLNAME'],
                            email=content['EMAIL'],
                            party=party, url=leg_url, photo_url=photo_url,
                            occupation=content['OCCUPATION'])

    address = ('Room %s\n'
               'Kansas State Capitol Building\n'
               '300 SW 10th St.\n'
               'Topeka, KS 66612') % content['OFFICENUM']
    legislator.add_office('capitol', 'Capitol Office',
                          phone=content['OFFPH'] or None, address=address)

    legislator.add_source(url)
    self.save_legislator(legislator)
def scrape(self, chamber, session): url = self.get_district_list(chamber, session) people_pages = self.scrape_directory(url, chamber, session) for person in people_pages: district = person p_url = people_pages[district] metainf = self.process_person(p_url) p = Legislator( session, chamber, district, metainf["name"], party=metainf["party"], # some additional things the website provides: occupation=metainf["occupation"], photo_url=metainf["photo_url"], url=metainf["homepage"], ) if "email" in metainf: p["email"] = metainf["email"] if "number" in metainf: p.add_office( "capitol", "Capitol Office", phone=metainf["number"], address="200 E. Colfax\nDenver, CO 80203" ) p.add_source(p_url) self.save_legislator(p)
def scrape_senators(self, chamber, session, term):
    url = self.senator_url % (session[2:])
    root_url = url
    page = self.urlopen(url)
    page = lxml.html.fromstring(page)
    table = page.xpath('//*[@id="mainContent"]/table//table/tr')
    rowcount = 0
    for tr in table:
        rowcount += 1
        # the first row is a header, skip it:
        if rowcount < 2:
            continue

        tds = tr.xpath('td')
        full_name = tds[0].xpath('div/a')[0].text_content().strip()

        party_and_district = tds[1].xpath('div')[0].text_content().strip().split('-')
        if party_and_district[0] == 'D':
            party = 'Democratic'
        elif party_and_district[0] == 'R':
            party = 'Republican'
        senator_key = "%s%s" % (party_and_district[0].lower(),
                                party_and_district[1])
        district = party_and_district[1]
        phone = tds[3].xpath('div')[0].text_content().strip()
        url = self.senator_details_url % (session[2:], int(district))

        leg = Legislator(term, chamber, district, full_name,
                         party=party, url=url)
        leg.add_source(root_url)

        details_page = self.urlopen(url)
        leg.add_source(url)
        homepage = url
        page = lxml.html.fromstring(details_page)
        photo_url = page.xpath("//div[@id='container']/div[1]/img")
        photo_url = photo_url[0].attrib['src']

        url = self.senator_address_url % (session[2:], int(senator_key[1:]))
        details_page = self.urlopen(url)
        leg.add_source(url)
        page = lxml.html.fromstring(details_page)
        address = page.xpath('/html/body//span[2]')[0].text_content().split('\n')
        email = page.xpath('/html/body/p/span[2]/a/@href')
        # TODO This is only true if the href doesn't contain 'mail_form'.
        # If it does, then there is only a webform. So...no email?
        # TODO a lot of these have fax numbers. Include?
        kwargs = {"address": "%s%s" % (address[0], address[1])}
        if phone.strip() != "":
            kwargs['phone'] = phone
        leg.add_office("capitol", "Capitol Office", **kwargs)
        leg['photo_url'] = photo_url
        if email and len(email) > 0 and email[0] != 'mailto:':
            leg['email'] = email[0].split(':')[1]
        self.save_legislator(leg)
def scrape(self, chamber, session):
    url = self.get_district_list(chamber, session)
    people_pages = self.scrape_directory(url, chamber, session)

    for person in people_pages:
        district = person
        p_url = people_pages[district]
        metainf = self.process_person(p_url)

        p = Legislator(session, chamber, district, metainf['name'],
                       party=metainf['party'],
                       # some additional things the website provides:
                       occupation=metainf['occupation'],
                       photo_url=metainf['photo_url'],
                       url=metainf['homepage'])
        if "email" in metainf:
            p['email'] = metainf['email']
        if "number" in metainf:
            p.add_office('capitol', 'Capitol Office',
                         phone=metainf['number'],
                         address='200 E. Colfax\nDenver, CO 80203')
        p.add_source(p_url)

        if 'ctty' in metainf:
            for ctty in metainf['ctty']:
                p.add_role('committee member',
                           term=session,
                           chamber=chamber,
                           committee=clean_committee(ctty),
                           position="member")

        self.save_legislator(p)
def get_member(self, term, chamber, kpid):
    url = '%smembers/%s' % (ksapi.url, kpid)
    content = json.loads(self.urlopen(url))['content']

    party = content['PARTY']
    if party == 'Democrat':
        party = 'Democratic'

    slug = {'2013-2014': 'b2013_14'}[term]
    leg_url = '%s/%s/members/%s/' % (LI, slug, kpid)
    photo_url = '%s/m/images/pics/%s.jpg' % (LI, kpid)

    legislator = Legislator(term, chamber, str(content['DISTRICT']),
                            content['FULLNAME'],
                            email=content['EMAIL'],
                            party=party, url=leg_url, photo_url=photo_url,
                            occupation=content['OCCUPATION'])

    # just do office address for now, can get others from api
    # (the two branches were inverted: the room-number formatting only
    # makes sense when OFFICENUM is set, so fall back to the chamber's
    # general address when it isn't)
    if content['OFFICENUM']:
        address = ('Room %s\n'
                   'Kansas State Capitol Building\n'
                   '300 SW 10th St.\n'
                   'Topeka, KS 66612') % content['OFFICENUM']
    else:
        address = ('Kansas House of Representatives\n'
                   'Docking State Office Building\n'
                   '901 SW Harrison Street\n'
                   'Topeka, KS 66612')

    legislator.add_office('capitol', 'Capitol Office',
                          phone=content['OFFPH'] or None, address=address)

    legislator.add_source(url)
    self.save_legislator(legislator)
def scrape(self, term, chambers):
    represent_url = ('http://represent.opennorth.ca/representatives/%s/'
                     '?limit=500' % self.representative_set)
    data = json.load(urllib2.urlopen(represent_url))

    for rep in data['objects']:
        leg = Legislator(term, 'lower', rep['district_name'], rep['name'],
                         party=rep.get('party_name'),
                         photo_url=rep.get('photo_url'),
                         url=rep.get('url'),
                         email=rep.get('email'))
        leg.add_source(rep['source_url'])

        for rep_office in rep.get('offices', []):
            name = rep_office.get('postal', '').split('\n')[0]
            if not name:
                name = (rep_office.get('type', '').title() + ' office').strip()

            leg.add_office(
                'capitol' if rep_office.get('type') == 'legislature' else 'district',
                name,
                phone=rep_office.get('tel'),
                fax=rep_office.get('fax'),
                address=rep_office.get('postal'))

        self.save_legislator(leg)
def scrape_upper(self, term):
    url = 'http://www.utahsenate.org/aspx/roster.aspx'
    html = self.urlopen(url)
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)

    for row in doc.xpath('//tr')[1:]:
        tds = row.xpath('td')

        # 1st has district
        district = tds[0].text_content()

        # 3rd has name and email
        person = tds[2].xpath('span[@class="person"]')[0]
        if '(D)' in person.text_content():
            party = 'Democratic'
        elif '(R)' in person.text_content():
            party = 'Republican'
        else:
            raise ValueError('unknown party')
        a = person.xpath('a')[0]
        name = a.text_content()
        leg_url = a.get('href')
        email = tds[2].xpath('span[@class="email"]/a/text()')
        if email:
            email = email[0]
        else:
            email = ''

        # office address
        # text is split by br in 4th td, join with a space
        address = ' '.join(tds[3].xpath('font/text()'))

        numbers = tds[4].xpath('text()')
        phone = None
        fax = None
        for num in numbers:
            if num.startswith(('Cell', 'Home', 'Work')) and not phone:
                phone = num.split(u'\xa0')[-1]
            elif num.startswith('Fax'):
                fax = num.split(u'\xa0')[-1]

        # get photo
        try:
            leg_html = self.urlopen(leg_url)
            leg_doc = lxml.html.fromstring(leg_html)
            leg_doc.make_links_absolute(leg_url)
            photo_url = leg_doc.xpath('//p[@class="photo"]/img/@src')[0]
        except:
            self.warning('could not fetch %s' % leg_url)
            photo_url = ''

        leg = Legislator(term, 'upper', district, name, party=party,
                         email=email, address=address, photo_url=photo_url,
                         url=leg_url)
        leg.add_office('district', 'Home', address=address, phone=phone,
                       fax=fax)
        leg.add_source(url)
        leg.add_source(leg_url)
        self.save_legislator(leg)
def scrape_senate(self, term):
    urls = (
        'http://www.senadopr.us/senadores/Pages/Senadores%20Acumulacion.aspx',
        'http://www.senadopr.us/Pages/Senadores%20Distrito%20I.aspx',
        'http://www.senadopr.us/Pages/Senadores%20Distrito%20II.aspx',
        'http://www.senadopr.us/Pages/Senadores%20Distrito%20III.aspx',
        'http://www.senadopr.us/Pages/Senadores%20Distrito%20IV.aspx',
        'http://www.senadopr.us/Pages/Senadores%20Distrito%20V.aspx',
        'http://www.senadopr.us/Pages/Senadores%20Distrito%20VI.aspx',
        'http://www.senadopr.us/Pages/Senadores%20Distrito%20VII.aspx',
        'http://www.senadopr.us/Pages/Senadores%20Distrito%20VIII.aspx')

    for counter, url in enumerate(urls):
        leg_page_html = self.urlopen(url)
        doc = lxml.html.fromstring(leg_page_html)
        doc.make_links_absolute(url)
        table = doc.xpath('//table[@summary="Listado de Senadores"]')[0]

        # skip first row
        for row in table.xpath('tr')[1:]:
            tds = row.xpath('td')

            name = tds[0].text_content().title().replace('Hon.', '', 1).strip()
            party = tds[1].text_content()
            phone = tds[2].text_content()
            email = tds[3].text_content()

            # shapefiles denote 0 as At-Large Districts
            if counter == 0:
                district = 'At-Large'
            else:
                district = str(counter)

            # Code to guess the picture
            # Those middle-name abbreviations are sometimes weird.
            namefixed = unicode(name.replace(".", ". "))
            # Remove the accents
            namefixed = unicodedata.normalize('NFKD', namefixed).encode('ascii', 'ignore')
            nameparts = namefixed.split()
            if nameparts[1].endswith('.'):
                lastname = nameparts[2]
            else:
                lastname = nameparts[1]

            # Construct the photo url
            picture_filename = ('http://www.senadopr.us/Fotos%20Senadores/sen_'
                                + (nameparts[0][0] + lastname).lower() + '.jpg')

            try:
                # Checking to see if the file is there
                picture_data = self.urlopen(picture_filename)
                leg = Legislator(term, 'upper', district, name,
                                 party=party,
                                 email=email, url=url,
                                 photo_url=picture_filename)
            except scrapelib.HTTPError:
                # If not, leave out the photo_url
                leg = Legislator(term, 'upper', district, name,
                                 party=party, phone=phone,
                                 email=email, url=url)

            leg.add_office('capitol', 'Oficina del Capitolio',
                           phone=phone)
            leg.add_source(url)
            self.save_legislator(leg)
def scrape(self, chamber, term):
    chamber_abbrev = {'upper': 'S', 'lower': 'H'}[chamber]

    url = ("http://legisweb.state.wy.us/LegislatorSummary/LegislatorList"
           ".aspx?strHouse=%s&strStatus=N" % chamber_abbrev)
    page = lxml.html.fromstring(self.urlopen(url))
    page.make_links_absolute(url)

    for link in page.xpath("//a[contains(@href, 'LegDetail')]"):
        name = link.text.strip()
        leg_url = link.get('href')
        email_address = link.xpath("../../../td[2]//a")[0].attrib['href']
        email_address = email_address.split('Mailto:')[1]
        party = link.xpath("string(../../../td[3])").strip()
        if party == 'D':
            party = 'Democratic'
        elif party == 'R':
            party = 'Republican'

        district = link.xpath("string(../../../td[4])").strip().lstrip('HS0')

        leg_page = lxml.html.fromstring(self.urlopen(leg_url))
        leg_page.make_links_absolute(leg_url)
        img = leg_page.xpath(
            "//img[contains(@src, 'LegislatorSummary/photos')]")[0]
        photo_url = img.attrib['src']

        office_tds = leg_page.xpath(
            '//table[@id="ctl00_cphContent_tblContact"]/tr/td/text()')
        address = []
        phone = None
        fax = None
        for td in office_tds:
            # str.strip() takes a character set, not a prefix, so use
            # replace() to drop the labels without mangling the values
            if td.startswith('Home -'):
                phone = td.replace('Home -', '').strip()
            # only use cell if home isn't present
            elif td.startswith('Cell -') and not phone:
                phone = td.replace('Cell -', '').strip()
            elif td.startswith('Fax -'):
                fax = td.replace('Fax -', '').strip()
            else:
                address.append(td)

        leg = Legislator(term, chamber, district, name, party=party,
                         email=email_address, photo_url=photo_url,
                         url=leg_url)

        adr = " ".join(address)
        if adr.strip() != "":
            leg.add_office('district', 'Contact Information', address=adr,
                           phone=phone, fax=fax)

        leg.add_source(url)
        leg.add_source(leg_url)
        self.save_legislator(leg)
def scrape(self, chamber, session):
    metainf = self.scrape_leg_page(get_legislator_listing_url(chamber))
    for leg in metainf:
        chamber = {"House": "lower",
                   "Senate": "upper"}[leg['chamber']]
        p = Legislator(session, chamber, leg['district'], leg['name'],
                       party=leg['party'],
                       # some additional things the website provides:
                       photo_url=leg['image'],
                       url=leg['homepage'],
                       email=leg['email'])
        p.add_office('capitol', 'Capitol Office',
                     address=leg['addr'],
                     phone=leg['phone'],
                     fax=leg['fax'] or None)

        for source in leg['source']:
            p.add_source(source)

        try:
            for ctty in leg['ctty']:
                flag = 'Joint Legislative'
                if ctty['name'][:len(flag)] == flag:
                    ctty_chamber = "joint"
                else:
                    ctty_chamber = chamber

                p.add_role('committee member',
                           term=session,
                           chamber=ctty_chamber,
                           committee=ctty['name'],
                           position="member")
        except KeyError:
            self.log("XXX: Warning, %s has no scraped committees" % leg['name'])

        self.save_legislator(p)
def scrape_details(self, chamber, term, leg_name, leg_link, role):
    if not leg_link:
        # Vacant post, likely:
        if "Vacancy" in leg_name:
            return
        raise Exception("leg_link is null. something went wrong")
    try:
        url = 'http://billstatus.ls.state.ms.us/members/%s' % leg_link
        url_root = os.path.dirname(url)
        details_page = self.urlopen(url)
        root = lxml.etree.fromstring(details_page.bytes)
        party = root.xpath('string(//PARTY)')

        district = root.xpath('string(//DISTRICT)')

        photo = "%s/%s" % (url_root, root.xpath('string(//IMG_NAME)'))

        home_phone = root.xpath('string(//H_PHONE)')
        bis_phone = root.xpath('string(//B_PHONE)')
        capital_phone = root.xpath('string(//CAP_PHONE)')
        other_phone = root.xpath('string(//OTH_PHONE)')
        org_info = root.xpath('string(//ORG_INFO)')
        email_name = root.xpath('string(//EMAIL_ADDRESS)')
        cap_room = root.xpath('string(//CAP_ROOM)')

        if party == 'D':
            party = 'Democratic'
        else:
            party = 'Republican'

        leg = Legislator(term, chamber, district, leg_name,
                         party=party,
                         role=role,
                         org_info=org_info,
                         url=url,
                         photo_url=photo)
        leg.add_source(url)

        kwargs = {}
        if email_name.strip() != "":
            email = '%s@%s.ms.gov' % (email_name, {
                "upper": "senate",
                "lower": "house"
            }[chamber])
            kwargs['email'] = email

        if capital_phone != "":
            kwargs['phone'] = capital_phone

        if cap_room != "":
            kwargs["address"] = "Room %s\n%s" % (cap_room, CAP_ADDRESS)
        else:
            kwargs['address'] = CAP_ADDRESS

        leg.add_office('capitol', 'Capitol Office', **kwargs)

        self.save_legislator(leg)
    except scrapelib.HTTPError, e:
        self.warning(str(e))
def scrape(self, chamber, session):
    url = self.get_district_list(chamber, session)
    people_pages = self.scrape_directory(url, chamber, session)

    for person in people_pages:
        district = person
        p_url = people_pages[district]
        metainf = self.process_person(p_url)

        p = Legislator(session, chamber, district, metainf['name'],
                       party=metainf['party'],
                       # some additional things the website provides:
                       occupation=metainf['occupation'],
                       photo_url=metainf['photo_url'],
                       url=metainf['homepage'])

        phone = metainf['number'] if 'number' in metainf else None
        email = metainf['email'] if 'email' in metainf else None

        p.add_office('capitol', 'Capitol Office',
                     phone=phone,
                     address='200 E. Colfax\nDenver, CO 80203',
                     email=email)
        p.add_source(p_url)
        self.save_legislator(p)
def scrape_upper(self, chamber, term):
    url = 'http://www.senate.michigan.gov/members/memberlist.htm'
    with self.urlopen(url) as html:
        doc = lxml.html.fromstring(html)
        for row in doc.xpath('//table[@width=550]/tr')[1:39]:
            # party, dist, member, office_phone, office_fax, office_loc
            party = abbr[row.xpath('td[1]/text()')[0]]
            district = row.xpath('td[2]/a/text()')[0]
            leg_url = row.xpath('td[3]/a/@href')[0]
            name = row.xpath('td[3]/a/text()')[0]
            office_phone = row.xpath('td[4]/text()')[0]
            office_fax = row.xpath('td[5]/text()')[0]
            office_loc = row.xpath('td[6]/text()')[0]
            leg = Legislator(term=term, chamber=chamber,
                             district=district, full_name=name,
                             party=party, url=leg_url)
            leg.add_office('capitol', 'Capitol Office',
                           address=office_loc,
                           fax=office_fax,
                           phone=office_phone)
            leg.add_source(url)
            self.save_legislator(leg)
def scrape(self, chamber, term):
    if chamber == 'upper':
        url = 'http://webserver.rilin.state.ri.us/Documents/Senators.xls'
        rep_type = 'Senator '
    elif chamber == 'lower':
        url = 'http://webserver.rilin.state.ri.us/Documents/Representatives.xls'
        rep_type = 'Representative '

    self.urlretrieve(url, 'ri_leg.xls')

    wb = xlrd.open_workbook('ri_leg.xls')
    sh = wb.sheet_by_index(0)

    for rownum in xrange(1, sh.nrows):
        d = {}
        for field, col_num in excel_mapping.iteritems():
            d[field] = sh.cell(rownum, col_num).value

        dist = str(int(d['district']))
        district_name = dist

        full_name = re.sub(rep_type, '', d['full_name']).strip()

        translate = {
            "Democrat": "Democratic",
            "Republican": "Republican",
            "Independent": "Independent"
        }

        leg = Legislator(term, chamber, district_name, full_name,
                         '', '', '', translate[d['party']],
                         town_represented=d['town_represented'],
                         email=d['email'])
        leg.add_office('district', 'Address', address=d['address'])
        leg.add_source(url)
        self.save_legislator(leg)
def test_legislator():
    l = Legislator('T1', 'upper', '1', 'Adam Smith', 'Adam', 'Smith')
    assert_equal(l, {'_type': 'person',
                     'full_name': 'Adam Smith',
                     'first_name': 'Adam',
                     'last_name': 'Smith',
                     'middle_name': '',
                     'suffixes': '',
                     'roles': [{'chamber': 'upper', 'term': 'T1',
                                'role': 'member', 'start_date': None,
                                'end_date': None, 'district': '1',
                                'party': ''}],
                     'offices': [],
                     'sources': []})

    l.add_role('committee member', 'T1', committee='Some Committee',
               position='chairman')
    assert_equal(l['roles'][1], {'role': 'committee member', 'term': 'T1',
                                 'start_date': None, 'end_date': None,
                                 'committee': 'Some Committee',
                                 'position': 'chairman'})

    l.add_office('capitol', 'Statehouse Office', '123 Main St',
                 '123-456-7890', '123-555-5555', '*****@*****.**')
    assert_equal(l['offices'], [{'type': 'capitol',
                                 'name': 'Statehouse Office',
                                 'address': '123 Main St',
                                 'phone': '123-456-7890',
                                 'fax': '123-555-5555',
                                 'email': '*****@*****.**'}])
def table_row_to_legislator_and_profile_url(table_row_element, chamber, term):
    """Derive a Legislator from an HTML table row lxml Element,
    and a link to their profile"""
    td_elements = table_row_element.xpath('td')
    (role_element, name_element, district_element, party_element,
     phone_element, email_element) = td_elements

    full_name = name_element.text_content().strip()

    district = district_element.text_content().strip()

    party = party_element.text_content().strip()
    if party == 'Democrat':
        party = 'Democratic'

    legislator = Legislator(term, chamber, district, full_name, party=party)

    role = role_element.text_content().strip()
    address = co_address_from_role(role)
    phone = phone_element.text_content().strip()
    email = email_element.text_content().strip()

    legislator.add_office(
        'capitol',
        'Capitol Office',
        address=address,
        phone=phone,
        email=email,
    )

    (profile_url, ) = name_element.xpath('a/@href')

    return legislator, profile_url
def scrape_reps(self, chamber, session, term):
    url = self.reps_url % (session)
    page = self.urlopen(url)
    page = lxml.html.fromstring(page)

    # This is the ASP.net table container
    table_xpath = 'id("ContentPlaceHolder1_gridMembers_DXMainTable")'
    table = page.xpath(table_xpath)[0]
    for tr in table.xpath('tr')[1:]:
        tds = tr.xpath('td')
        leg_code = tds[0].xpath('a[1]')[0].attrib.get('href')
        last_name = tds[0].text_content().strip()
        first_name = tds[1].text_content().strip()
        full_name = '%s %s' % (first_name, last_name)
        district = str(int(tds[2].text_content().strip()))
        party = tds[3].text_content().strip()
        if party == 'Democrat':
            party = 'Democratic'

        phone = tds[4].text_content().strip()
        room = tds[5].text_content().strip()
        address = self.assumed_address_fmt % (room if room else '')

        if last_name == 'Vacant':
            leg = Legislator(term, chamber, district, full_name=full_name,
                             first_name=first_name, last_name=last_name,
                             party=party, _code=leg_code, url=url)
            leg.add_office('capitol', "Capitol Office",
                           address=address, phone=phone)
            leg.add_source(url)
            self.save_vacant_legislator(leg)
        else:
            leg = Legislator(term, chamber, district, full_name=full_name,
                             first_name=first_name, last_name=last_name,
                             party=party, _code=leg_code, url=url)
            leg.add_office('capitol', 'Capitol Office',
                           address=address, phone=phone)

            url = self.rep_details_url % (session, district)
            leg.add_source(url)
            details_page = self.urlopen(url)
            page = lxml.html.fromstring(details_page)
            picture = page.xpath('//*[@id="ContentPlaceHolder1_imgPhoto"]/@src')
            email = page.xpath('//*[@id="ContentPlaceHolder1_lblAddresses"]/table/tr[4]/td/a/@href')
            terms = page.xpath('//*[@id="ContentPlaceHolder1_lblElected"]')
            committees = page.xpath('//*[@id="ContentPlaceHolder1_lblCommittees"]/li/a')

            for c in committees:
                leg.add_role('committee member', term,
                             committee=c.text_content().strip(),
                             chamber=chamber)

            # TODO home address?

            if len(email) > 0 and email[0] != 'mailto:':
                leg['email'] = email[0].split(':')[1]
            if len(picture) > 0:
                leg['photo_url'] = picture[0]

            self.save_legislator(leg)
def scrape(self, term, chambers):
    base_url = 'http://news.legislature.ne.gov/dist'
    # there are 49 districts
    for district in range(1, 50):
        if district < 10:
            rep_url = base_url + '0' + str(district) + '/biography/'
        else:
            rep_url = base_url + str(district) + '/biography/'

        try:
            html = self.urlopen(rep_url)
            page = lxml.html.fromstring(html)

            full_name = page.xpath('//div[@class="content_header_right"]/a')[0].text.split(' ', 1)[1].strip()

            # This is hacky, are lis always the same?
            address = page.xpath('//div[@id="sidebar"]/ul[1]/li[3]')[0].text.strip() + '\n'
            address += page.xpath('//div[@id="sidebar"]/ul[1]/li[4]')[0].text.strip() + '\n'
            address += page.xpath('//div[@id="sidebar"]/ul[1]/li[5]')[0].text.strip()

            phone = page.xpath('//div[@id="sidebar"]/ul[1]/li[6]')[0].text.split()
            phone = phone[1] + '-' + phone[2]

            email = page.xpath('//div[@id="sidebar"]/ul[1]/li[7]/a')[0].text or ''

            # Nebraska is officially nonpartisan
            party = 'Nonpartisan'
            leg = Legislator(term, 'upper', str(district), full_name,
                             party=party, email=email, url=rep_url)
            leg.add_source(rep_url)
            leg.add_office('capitol', 'Capitol Office',
                           address=address, phone=phone)
            self.save_legislator(leg)
        except scrapelib.HTTPError:
            self.warning('could not retrieve %s' % rep_url)
def scrape_member(self, chamber, term, member_url):
    page = self.urlopen(member_url)
    root = lxml.html.fromstring(page)
    root.make_links_absolute(member_url)

    photo_url = root.xpath('//div[starts-with(@class,"bioPicContainer")]/img/@src')[0]
    full_name = root.xpath('//div[starts-with(@class,"bioPicContainer")]/img/@alt')[0]

    email = root.xpath('//a[contains(@href, "mailto")]/@href')[0]
    email = email.replace("mailto:", "")

    district = root.xpath('//div[@id="District"]//div[starts-with(@class,"widgetContent")]')
    if len(district):
        district = district[0].text_content().strip()
        district = clean_district(district)
    else:
        self.logger.warning("No district tab found for this hot garbage. Skipping.")
        return

    party = root.xpath('//span[@class="legislatorAffiliation"]/text()')[0]
    if party == "D":
        party = "Democratic"
    elif party == "R":
        party = "Republican"
    else:
        party = "Other"

    leg = Legislator(term, chamber, district, full_name, party=party,
                     photo_url=photo_url, url=member_url, email=email)
    leg.add_source(member_url)

    # offices
    for dl in root.xpath('//dl[@class="address"]'):
        office_name = phone = fax = email = None
        address = []
        for child in dl.getchildren():
            text = child.text_content()
            if child.tag == "dt":
                office_name = text
            else:
                # replace() rather than strip(): strip() takes a character
                # set and can eat leading/trailing letters of the value
                if text.startswith("Phone:"):
                    phone = text.replace("Phone:", "").strip() or None
                elif text.startswith("Fax:"):
                    fax = text.replace("Fax:", "").strip() or None
                elif text.startswith("Email:"):
                    email = text.replace("Email:", "").strip() or None
                else:
                    address.append(text)
        # all pieces collected
        if "District" in office_name:
            otype = "district"
        else:
            otype = "capitol"
        leg.add_office(otype, office_name, phone=phone, fax=fax,
                       address="\n".join(address), email=email)

    self.save_legislator(leg)
def scrape_upper(self, chamber, term):
    url = 'http://www.senate.michigan.gov/members/memberlist.htm'
    html = self.urlopen(url)
    doc = lxml.html.fromstring(html)
    for row in doc.xpath('//table[@width=550]/tr')[1:39]:
        # party, dist, member, office_phone, office_fax, office_loc
        party, dist, member, phone, fax, loc = row.getchildren()
        party = abbr[party.text]
        district = dist.text_content().strip()
        name = member.text_content().strip()
        if name == 'Vacant':
            self.info('district %s is vacant', district)
            continue
        leg_url = member.xpath('a/@href')[0]
        office_phone = phone.text
        office_fax = fax.text
        office_loc = loc.text
        leg = Legislator(term=term, chamber=chamber, district=district,
                         full_name=name, party=party, url=leg_url)
        leg.add_office('capitol', 'Capitol Office',
                       address=office_loc,
                       fax=office_fax,
                       phone=office_phone)
        leg.add_source(url)
        self.save_legislator(leg)
def scrape(self, term, chambers):
    url = 'http://gencourt.state.nh.us/downloads/Members(Asterisk%20Delimited).txt'

    option_map = {}
    html = self.urlopen('http://www.gencourt.state.nh.us/house/members/memberlookup.aspx')
    doc = lxml.html.fromstring(html)
    for opt in doc.xpath('//option'):
        option_map[opt.text] = opt.get('value')

    with self.urlopen(url) as data:
        for line in data.splitlines():
            if line.strip() == "":
                continue

            (chamber, fullname, last, first, middle, county, district_num,
             seat, party, street, street2, city, astate, zipcode,
             home_phone, office_phone, fax, email, com1, com2, com3,
             com4, com5, _, _) = line.split('*')

            chamber = chamber_map[chamber]

            # skip legislators from a chamber we aren't scraping
            if chamber not in chambers:
                continue

            if middle:
                full = '%s %s %s' % (first, middle, last)
            else:
                full = '%s %s' % (first, last)

            address = street
            if street2:
                address += (' ' + street2)
            address += '\n%s, %s %s' % (city, astate, zipcode)

            district = str(int(district_num))
            if county:
                district = '%s %s' % (county, district)

            leg = Legislator(term, chamber, district, full, first, last,
                             middle, party_map[party], email=email)
            leg.add_office('district', 'Home Address',
                           address=address, phone=home_phone or None)
            leg.add_office('district', 'Office Address',
                           phone=office_phone or None, fax=fax or None)

            if chamber == 'upper':
                leg['url'] = ('http://www.gencourt.state.nh.us/Senate/members/'
                              'webpages/district%02d.aspx' % int(district_num))
            elif chamber == 'lower':
                code = option_map.get('{0}, {1}'.format(last, first))
                if code:
                    leg['url'] = ('http://www.gencourt.state.nh.us/house/'
                                  'members/member.aspx?member=' + code)

            for com in (com1, com2, com3, com4, com5):
                if com:
                    leg.add_role('committee member', term=term,
                                 chamber=chamber, committee=com)

            leg.add_source(url)
            self.save_legislator(leg)
def scrape(self, chamber, term): if chamber == 'upper': url = "http://legis.wisconsin.gov/Pages/leg-list.aspx?h=s" else: url = "http://legis.wisconsin.gov/Pages/leg-list.aspx?h=a" body = self.urlopen(url) page = lxml.html.fromstring(body) page.make_links_absolute(url) for row in page.xpath("//table[@class='legis-list']/tr")[1:]: if row.xpath(".//a/@href"): rep_url = row.xpath(".//a/@href")[0] rep_doc = lxml.html.fromstring(self.urlopen(rep_url)) rep_doc.make_links_absolute(rep_url) first_name = rep_doc.xpath('//h2[@class="given-name"]/text()')[0] last_name = rep_doc.xpath('//h2[@class="family-name"]/text()')[0] full_name = '%s %s' % (first_name, last_name) party = rep_doc.xpath('//div[@class="party"]/text()')[0] if party == 'Democrat': party = 'Democratic' district = str(int(row.getchildren()[2].text_content())) # email email = rep_doc.xpath('//a[starts-with(@href, "mailto")]/text()') if email: email = email[0] else: email = '' leg = Legislator(term, chamber, district, full_name, first_name=first_name, last_name=last_name, party=party, url=rep_url, email=email) img = rep_doc.xpath('//img[@class="photo"]/@src') if img: leg['photo_url'] = img[0] # office #### address = '\n'.join(rep_doc.xpath('//dt[text()="Madison Office"]/following-sibling::dd/div/text()')) phone = rep_doc.xpath('//dt[text()="Telephone"]/following-sibling::dd/div/text()') if phone: phone = re.sub('\s+', ' ', phone[0]).strip() else: phone = None fax = rep_doc.xpath('//dt[text()="Fax"]/following-sibling::dd/div/text()') if fax: fax = re.sub('\s+', ' ', fax[0]).strip() else: fax = None leg.add_office('capitol', 'Madison Office', address=address, phone=phone, fax=fax) # save legislator leg.add_source(rep_url) self.save_legislator(leg)
def scrape(self, term, chambers):
    year_abr = term[0:4]

    file_url, db = self.get_dbf(year_abr, 'ROSTER')
    bio_url, bio_db = self.get_dbf(year_abr, 'LEGBIO')

    photos = {}
    for rec in bio_db:
        photos[rec['roster_key']] = rec['urlpicture']

    for rec in db:
        first_name = rec["firstname"]
        middle_name = rec["midname"]
        last_name = rec["lastname"]
        suffix = rec["suffix"]
        # join the name parts, collapsing the extra whitespace left by
        # empty middle names or suffixes (the old last-character trim
        # would clip a real suffix)
        full_name = ' '.join(
            ' '.join((first_name, middle_name, last_name, suffix)).split())

        district = int(rec["district"])
        party = rec["party"]
        if party == 'R':
            party = "Republican"
        elif party == 'D':
            party = "Democratic"

        chamber = rec["house"]
        if chamber == 'A':
            chamber = "lower"
        elif chamber == 'S':
            chamber = "upper"

        title = rec["title"]
        legal_position = rec["legpos"]
        leg_status = rec["legstatus"]
        if 'email' in rec:
            email = rec["email"]
        else:
            email = ''
        photo_url = photos[rec['roster_key']]
        address = '{0}\n{1}, {2} {3}'.format(rec['address'], rec['city'],
                                             rec['state'], rec['zipcode'])

        leg = Legislator(term, chamber, str(district), full_name,
                         first_name, last_name, middle_name, party,
                         suffixes=suffix, title=title,
                         legal_position=legal_position,
                         leg_status=leg_status,
                         email=email, photo_url=photo_url)
        leg.add_source(file_url)
        leg.add_office('district', 'District Office', address=address,
                       phone=rec['phone'])
        self.save_legislator(leg)
def scrape(self, term, chambers): leg_url = "ftp://ftp.cga.ct.gov/pub/data/LegislatorDatabase.csv" data = self.get(leg_url) page = open_csv(data) for row in page: chamber = {'H': 'lower', 'S': 'upper'}[row['office code']] if chamber not in chambers: continue district = row['dist'].lstrip('0') name = row['first name'] mid = row['middle initial'].strip() if mid: name += " %s" % mid name += " %s" % row['last name'] suffix = row['suffix'].strip() if suffix: name += " %s" % suffix party = row['party'] if party == 'Democrat': party = 'Democratic' leg = Legislator(term, chamber, district, name, first_name=row['first name'], last_name=row['last name'], middle_name=row['middle initial'], suffixes=row['suffix'], party=party, email=row['email'].strip(), url=row['URL'], office_phone=row['capitol phone']) office_address = "%s, Room %s\nHartford, CT 06106-1591" % ( row['capitol street address'], row['room number']) leg.add_office('capitol', 'Capitol Office', address=office_address, phone=row['capitol phone']) # skipping home address for now leg.add_source(leg_url) for comm in row['committee member1'].split(';'): if comm: if ' (' in comm: comm, role = comm.split(' (') role = role.strip(')').lower() else: role = 'member' comm = comm.strip() if comm == '': continue leg.add_role('committee member', term, chamber='joint', committee=comm, position=role) self.save_legislator(leg)
def scrape_member(self, chamber, term, member_url):
    with self.urlopen(member_url) as page:
        root = lxml.html.fromstring(page)
        root.make_links_absolute(member_url)

        photo_url = root.xpath('//div[starts-with(@class,"bioPicContainer")]/img/@src')[0]
        full_name = root.xpath('//div[starts-with(@class,"bioPicContainer")]/img/@alt')[0]

        email = root.xpath('//a[contains(@href, "mailto")]/@href')[0]
        email = email.replace('mailto:', '')

        district = root.xpath('//div[@id="District"]//div[starts-with(@class,"widgetContent")]')
        if len(district):
            district = district[0].text.strip()
            district = clean_district(district)
        else:
            self.logger.warning('No district tab found for this hot garbage. Skipping.')
            return

        party = root.xpath('//span[@class="legislatorAffiliation"]/text()')[0]
        if party == 'D':
            party = 'Democratic'
        elif party == 'R':
            party = 'Republican'
        else:
            party = 'Other'

        leg = Legislator(term, chamber, district, full_name, party=party,
                         photo_url=photo_url, url=member_url, email=email)
        leg.add_source(member_url)

        # offices
        for dl in root.xpath('//dl[@class="address"]'):
            office_name = phone = fax = email = None
            address = []
            for child in dl.getchildren():
                text = child.text_content()
                if child.tag == 'dt':
                    office_name = text
                else:
                    # replace() rather than strip(): strip() takes a
                    # character set and can eat letters of the value
                    if text.startswith('Phone:'):
                        phone = text.replace('Phone:', '').strip() or None
                    elif text.startswith('Fax:'):
                        fax = text.replace('Fax:', '').strip() or None
                    elif text.startswith('Email:'):
                        email = text.replace('Email:', '').strip() or None
                    else:
                        address.append(text)
            # all pieces collected
            if 'District' in office_name:
                otype = 'district'
            else:
                otype = 'capitol'
            leg.add_office(otype, office_name, phone=phone, fax=fax,
                           address='\n'.join(address), email=email)

        self.save_legislator(leg)
def _scrape_senator(self, url, term):
    page = lxml.html.fromstring(self.get(url).text)

    name_district = page.xpath('//div[@class="memtitle"]/text()')[0]
    name, district = re.search(r'Senator (.+): District (\d+)',
                               name_district).group(1, 2)

    try:
        party_text = re.search(
            r'Party: ?(.+)',
            page.xpath('//p[@class="meminfo"][1]')[0].text_content()) \
            .group(1).strip()
        party = {
            'Democrat': 'Democratic',
            'Republican': 'Republican'
        }[party_text]
    except:
        # A handful of senate pages don't list the legislators' parties,
        # so check the parties' own listings:
        party = self._get_party('upper', district)

    legislator = Legislator(term, 'upper', district, name, party=party,
                            url=url)
    legislator.add_source(url)

    offices_text = [
        '\n'.join(line.strip() for line in office_td.itertext())
        for office_td in page.xpath('//td[@class="memoffice"]')
    ]
    for office_text in offices_text:
        mailing_address = next(
            iter(re.findall(
                r'Mailing Address:.+?7\d{4}', office_text,
                flags=re.DOTALL | re.IGNORECASE)),
            office_text
        )
        try:
            address = re.search(
                r'(?:\d+ |P\.?\s*O\.?).+7\d{4}', mailing_address,
                flags=re.DOTALL | re.IGNORECASE).group()
        except AttributeError:
            # No address was found; skip office.
            continue

        phone = extract_phone(office_text)
        fax = extract_fax(office_text)

        office_type = 'capitol' if any(
            zip_code in address for zip_code in ('78701', '78711')
        ) else 'district'
        office_name = office_type.title() + ' Office'

        legislator.add_office(office_type, office_name,
                              address=address.strip(),
                              phone=phone, fax=fax)

    self.save_legislator(legislator)
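# NOTE: `extract_phone` and `extract_fax` above are assumed helpers that
# aren't defined in this section. A minimal sketch of the behavior the
# call sites rely on -- pull the first labeled US phone/fax number out of
# a block of office text. The label words and number format here are
# assumptions; the real helpers' regexes may differ.
import re

_PHONE_RE = re.compile(
    r'(?:Phone|Tel)[:.]?\s*(\(?\d{3}\)?[ .-]?\d{3}[ .-]?\d{4})',
    re.IGNORECASE)
_FAX_RE = re.compile(
    r'Fax[:.]?\s*(\(?\d{3}\)?[ .-]?\d{3}[ .-]?\d{4})',
    re.IGNORECASE)


def extract_phone(text):
    # Return the first labeled phone number in `text`, or None.
    match = _PHONE_RE.search(text)
    return match.group(1) if match else None


def extract_fax(text):
    # Return the first labeled fax number in `text`, or None.
    match = _FAX_RE.search(text)
    return match.group(1) if match else None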
def scrape_chamber(self, chamber, term): url = "http://www.ncga.state.nc.us/gascripts/members/"\ "memberListNoPic.pl?sChamber=" if chamber == 'lower': url += 'House' else: url += 'Senate' data = self.get(url).text doc = lxml.html.fromstring(data) doc.make_links_absolute('http://www.ncga.state.nc.us') rows = doc.xpath('//div[@id="mainBody"]/table/tr') for row in rows[1:]: party, district, full_name, counties = row.getchildren() party = party.text_content().strip("()") party = party_map[party] district = district.text_content().replace("District","").strip() notice = full_name.xpath('span') if notice: notice = notice[0].text_content() # skip resigned legislators if 'Resigned' in notice or 'Deceased' in notice: continue else: notice = None link = full_name.xpath('a/@href')[0] full_name = full_name.xpath('a')[0].text_content() full_name = full_name.replace(u'\u00a0', ' ') # scrape legislator page details lhtml = self.get(link).text ldoc = lxml.html.fromstring(lhtml) ldoc.make_links_absolute('http://www.ncga.state.nc.us') photo_url = ldoc.xpath('//a[contains(@href, "pictures")]/@href')[0] phone = get_table_item(ldoc, 'Phone:') or None address = get_table_item(ldoc, 'Address:') or None email = ldoc.xpath('//a[starts-with(@href, "mailto:")]')[0] capitol_email = email.text capitol_phone = email.xpath('ancestor::tr[1]/preceding-sibling::tr[1]/td/span')[0].text capitol_address = email.xpath('ancestor::tr[1]/preceding-sibling::tr[2]/td/text()') capitol_address = [x.strip() for x in capitol_address] capitol_address = '\n'.join(capitol_address) or None capitol_phone = capitol_phone.strip() or None # save legislator legislator = Legislator(term, chamber, district, full_name, photo_url=photo_url, party=party, url=link, notice=notice) legislator.add_source(link) legislator.add_office('district', 'District Office', address=address, phone=phone) legislator.add_office('capitol', 'Capitol Office', address=capitol_address, phone=capitol_phone, email=capitol_email) self.save_legislator(legislator)
def scrape_details(self, chamber, term, leg_name, leg_link, role):
    if not leg_link:
        # Vacant post, likely:
        if "Vacancy" in leg_name:
            return
        raise Exception("leg_link is null. something went wrong")
    try:
        url = 'http://billstatus.ls.state.ms.us/members/%s' % leg_link
        url_root = os.path.dirname(url)
        details_page = self.get(url)
        root = lxml.etree.fromstring(details_page.content)
        party = root.xpath('string(//PARTY)')

        district = root.xpath('string(//DISTRICT)')

        photo = "%s/%s" % (url_root, root.xpath('string(//IMG_NAME)'))

        home_phone = root.xpath('string(//H_PHONE)')
        home_address = root.xpath('string(//H_ADDRESS)')
        home_address2 = root.xpath('string(//H_ADDRESS2)')
        home_city = root.xpath('string(//H_CITY)')
        home_zip = root.xpath('string(//H_ZIP)')

        home_address_total = "%s\n%s\n%s\n%s" % (home_address,
                                                 home_address2,
                                                 home_city,
                                                 home_zip)

        bis_phone = root.xpath('string(//B_PHONE)')
        capital_phone = root.xpath('string(//CAP_PHONE)')
        other_phone = root.xpath('string(//OTH_PHONE)')
        org_info = root.xpath('string(//ORG_INFO)')
        email_name = root.xpath('string(//EMAIL_ADDRESS)').strip()
        cap_room = root.xpath('string(//CAP_ROOM)')

        if leg_name in ('Oscar Denton', 'Lataisha Jackson',
                        'John G. Faulkner'):
            assert not party, ("Remove special-casing for this Democrat "
                               "without a listed party: {}").format(leg_name)
            party = 'Democratic'
        # note the one-element tuple: a bare string here would turn the
        # membership check into a substring test
        elif leg_name in ('James W. Mathis',):
            assert not party, ("Remove special-casing for this Republican "
                               "without a listed party: {}").format(leg_name)
            party = 'Republican'
        elif party == 'D':
            party = 'Democratic'
        elif party == 'R':
            party = 'Republican'
        else:
            raise AssertionError(
                "A member with no identifiable party was found: {}".format(
                    leg_name))

        leg = Legislator(term, chamber, district, leg_name,
                         party=party,
                         role=role,
                         org_info=org_info,
                         url=url,
                         photo_url=photo)
        leg.add_source(url)

        kwargs = {}
        if email_name != "":
            if "@" in email_name:
                email = email_name
            else:
                email = '%s@%s.ms.gov' % (email_name, {
                    "upper": "senate",
                    "lower": "house"
                }[chamber])
            kwargs['email'] = email

        if capital_phone != "":
            kwargs['phone'] = capital_phone

        if cap_room != "":
            kwargs["address"] = "Room %s\n%s" % (cap_room, CAP_ADDRESS)
        else:
            kwargs['address'] = CAP_ADDRESS

        leg.add_office('capitol', 'Capitol Office', **kwargs)

        kwargs = {}
        if home_phone != "":
            kwargs['phone'] = home_phone

        if home_address_total != "":
            kwargs['address'] = home_address_total

        if kwargs != {}:
            leg.add_office('district', 'District Office', **kwargs)

        self.save_legislator(leg)
    except scrapelib.HTTPError, e:
        self.warning(str(e))
def scrape_senators(self, chamber, term):
    session = ((int(term[0:4]) - 2009) / 2) + 124

    mapping = {
        'district': 1,
        'first_name': 2,
        'middle_name': 3,
        'last_name': 4,
        # 'suffix': 6,
        'party': 6,
        'resident_county': 5,
        'street_addr': 7,
        'city': 8,
        'state': 9,
        'zip_code': 10,
        'phone1': 12,
        'phone2': 13,
        'email': 11,
    }

    url = ('http://legisweb1.mainelegislature.org/wp/senate/'
           'wp-content/uploads/sites/2/2013/09/%sthSenatorsList.xlsx'
           % session)
    try:
        fn, result = self.urlretrieve(url)
    except scrapelib.HTTPError:
        url = 'http://www.maine.gov/legis/senate/%dthSenatorsList.xls'
        url = url % session
        fn, result = self.urlretrieve(url)

    wb = xlrd.open_workbook(fn)
    sh = wb.sheet_by_index(0)

    for rownum in xrange(1, sh.nrows):
        # get fields out of mapping
        d = {}
        for field, col_num in mapping.iteritems():
            try:
                d[field] = str(sh.cell(rownum, col_num).value)
            except IndexError:
                # This col_num doesn't exist in the sheet.
                pass

        full_name = " ".join((d['first_name'], d['middle_name'],
                              d['last_name']))
        full_name = re.sub(r'\s+', ' ', full_name).strip()

        address = "{street_addr}\n{city}, ME {zip_code}".format(**d)

        # For matching up legs with votes
        district_name = d['city']

        phone = d['phone1']

        district = d['district'].split('.')[0]

        leg_url = 'http://www.maine.gov/legis/senate/bio%02ds.htm' % int(district)

        leg = Legislator(term, chamber, district, full_name,
                         d['first_name'], d['middle_name'], d['last_name'],
                         _party_map[d['party']],
                         resident_county=d['resident_county'],
                         office_address=address,
                         office_phone=phone,
                         email=None,
                         district_name=district_name,
                         url=leg_url)
        leg.add_source(url)
        leg.add_source(leg_url)

        html = self.urlopen(leg_url)
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(leg_url)
        xpath = '//td[@class="XSP_MAIN_PANEL"]/descendant::img/@src'
        photo_url = doc.xpath(xpath)
        if photo_url:
            photo_url = photo_url.pop()
            leg['photo_url'] = photo_url
        else:
            photo_url = None

        office = dict(name='District Office',
                      type='district',
                      fax=None,
                      email=None,
                      address=address)

        leg['email'] = d['email']

        leg.add_office(**office)
        self.save_legislator(leg)
def scrape_chamber(self, chamber, term): url = "http://www.ncga.state.nc.us/gascripts/members/"\ "memberList.pl?sChamber=" if chamber == 'lower': url += 'House' else: url += 'Senate' data = self.urlopen(url) doc = lxml.html.fromstring(data) doc.make_links_absolute('http://www.ncga.state.nc.us') rows = doc.xpath('//div[@id="mainBody"]/table/tr') for row in rows[1:]: party, district, full_name, counties = row.getchildren() party = party.text_content() party = party_map[party] district = district.text_content() notice = full_name.xpath('span') if notice: notice = notice[0].text_content() # skip resigned legislators if 'Resigned' in notice or 'Deceased' in notice: continue else: notice = None link = full_name.xpath('a/@href')[0] full_name = full_name.xpath('a')[0].text_content() full_name = full_name.replace(u'\u00a0', ' ') # scrape legislator page details lhtml = self.urlopen(link) ldoc = lxml.html.fromstring(lhtml) ldoc.make_links_absolute('http://www.ncga.state.nc.us') photo_url = ldoc.xpath('//a[contains(@href, "pictures")]/@href')[0] phone = get_table_item(ldoc, 'Phone:') address = get_table_item(ldoc, 'Legislative Mailing Address:') or None email = ldoc.xpath( '//a[starts-with(@href, "mailto:")]')[0].text or '' # save legislator legislator = Legislator(term, chamber, district, full_name, photo_url=photo_url, party=party, url=link, notice=notice, email=email) legislator.add_source(link) legislator.add_office('capitol', 'Capitol Office', address=address, phone=phone) self.save_legislator(legislator)
def _parse_member(self, chamber, term, member):
    first_name = member.get('first-name')
    last_name = member.get('last-name')
    party = self.party_map[member.get('party')]

    # this is semi-safe because we validated term w/ latest_only=True
    session = self.metadata['terms'][-1]['sessions'][-1]

    # extra_fields
    extra_dict = {}
    for name, xpath in self.extra_fields.iteritems():
        result = member.xpath(xpath)
        if result:
            extra_dict[name] = result[0]

    # address fields
    for name, xpath in self.addr_fields.iteritems():
        result = member.xpath(xpath)
        if result:
            result = result[0]
            extra_dict[name] = '%s, %s, %s %s' % (
                result.get('street-address'), result.get('city'),
                result.get('state'), result.get('postal-code'))

    leg = Legislator(term, chamber, member.get('district-number'),
                     full_name=first_name + ' ' + last_name,
                     first_name=first_name,
                     last_name=last_name,
                     middle_name=member.get('middle-initial'),
                     party=party,
                     email=member.get('e-mail'),
                     url=member.get('website'),
                     oregon_member_id=member.get('leg-member-id'))

    # add offices
    leg.add_office('capitol', 'Capitol Office',
                   address=extra_dict['capitol_address'],
                   phone=extra_dict['phone'])
    if 'district_address' in extra_dict or 'district_phone' in extra_dict:
        leg.add_office('district', 'District Office',
                       address=extra_dict.get('district_address', None),
                       phone=extra_dict.get('district_phone', None))

    # committees
    com_xpath = ('committee-membership/session[@session-name="%s"]/committee'
                 % session)
    for com in member.xpath(com_xpath):
        cdict = {
            'position': com.get('title').lower(),
            'chamber': chamber,
        }
        com_name = com.get('name')
        com_class = com.get('committee-class')
        if com_class == 'sub-committee':
            cdict['committee'], cdict['subcommittee'] = \
                com.get('name').split(' Subcommittee On ')
        else:
            cdict['committee'] = com.get('name')
        leg.add_role('committee member', term, **cdict)

    leg.add_source(self.source_url)
    return leg
def scrape(self, term, chambers): leg_url = "ftp://ftp.cga.ct.gov/pub/data/LegislatorDatabase.csv" data = self.urlopen(leg_url) page = open_csv(data) for row in page: chamber = {'H': 'lower', 'S': 'upper'}[row['office code']] if chamber not in chambers: continue district = row['dist'].lstrip('0') name = row['first name'] mid = row['middle initial'].strip() if mid: name += " %s" % mid name += " %s" % row['last name'] suffix = row['suffix'].strip() if suffix: name += " %s" % suffix party = row['party'] if party == 'Democrat': party = 'Democratic' leg = Legislator(term, chamber, district, name, first_name=row['first name'], last_name=row['last name'], middle_name=row['middle initial'], suffixes=row['suffix'], party=party, email=row['email'].strip(), url=row['URL'], office_phone=row['capitol phone']) office_address = "%s, Room %s\nHartford, CT 06106-1591" % ( row['capitol street address'], row['room number']) leg.add_office('capitol', 'Capitol Office', address=office_address, phone=row['capitol phone']) # skipping home address for now leg.add_source(leg_url) for comm in row['committee member1'].split(';'): if comm: if ' (' in comm: comm, role = comm.split(' (') role = role.strip(')').lower() else: role = 'member' comm = comm.strip() if comm == '': continue leg.add_role('committee member', term, chamber='joint', committee=comm, position=role) self.save_legislator(leg)
def scrape(self, chamber, term):
    self.validate_term(term, latest_only=False)
    root_url = 'http://www.capitol.tn.gov/'
    parties = {'D': 'Democratic', 'R': 'Republican',
               'CCR': 'Carter County Republican',
               'I': 'Independent'}

    # testing for chamber
    if chamber == 'upper':
        url_chamber_name = 'senate'
        abbr = 's'
    else:
        url_chamber_name = 'house'
        abbr = 'h'
    if term != self.metadata["terms"][-1]["sessions"][0]:
        chamber_url = root_url + url_chamber_name
        chamber_url += '/archives/' + term + 'GA/Members/index.html'
    else:
        chamber_url = root_url + url_chamber_name + '/members/'

    page = self.urlopen(chamber_url)
    page = lxml.html.fromstring(page)

    for row in page.xpath("//tr")[1:]:
        # Skip any header row.
        if set(child.tag for child in row) == set(['th']):
            continue

        partyInit = row.xpath('td[2]')[0].text.split()[0]
        party = parties[partyInit]

        district = row.xpath('td[4]/a')[0].text.split()[1]
        address = row.xpath('td[5]')[0].text_content()
        # 301 6th Avenue North Suite
        address = address.replace(
            'LP', 'Legislative Plaza\nNashville, TN 37243')
        address = address.replace(
            'WMB', 'War Memorial Building\nNashville, TN 37243')
        address = '301 6th Avenue North\nSuite ' + address

        phone = row.xpath('td[6]')[0].text
        # special case for Karen D. Camper
        if phone is None:
            phone = row.xpath('td[6]/div')[0].text
        phone = '615-' + phone.split()[0]

        email = row.xpath('td[7]/a')[0].text
        member_url = (root_url + url_chamber_name + '/members/' + abbr +
                      district + '.html')
        member_photo_url = (root_url + url_chamber_name +
                            '/members/images/' + abbr + district + '.jpg')

        member_page = self.urlopen(member_url)
        member_page = lxml.html.fromstring(member_page)
        name = member_page.xpath('//div[@id="membertitle"]/h2')[0].text
        if 'Speaker' in name:
            full_name = name[8:len(name)]
        elif 'Lt.' in name:
            full_name = name[13:len(name)]
        elif abbr == 'h':
            full_name = name[5:len(name)]
        else:
            full_name = name[8:len(name)]

        leg = Legislator(term, chamber, district, full_name.strip(),
                         party=party, email=email, url=member_url,
                         photo_url=member_photo_url)
        leg.add_source(chamber_url)
        leg.add_source(member_url)

        # TODO: add district address from this page
        leg.add_office('capitol', 'Nashville Address',
                       address=address, phone=phone, email=email)

        self.save_legislator(leg)
def scrape(self, term, chambers): leg_url = "ftp://ftp.cga.ct.gov/pub/data/LegislatorDatabase.csv" page = self.get(leg_url) # Ensure that the spreadsheet's structure hasn't generally changed _row_headers = page.text.split('\r\n')[0].replace('"', '').split(',') assert _row_headers == HEADERS, "Spreadsheet structure may have changed" page = open_csv(page) for row in page: chamber = {'H': 'lower', 'S': 'upper'}[row['office code']] district = row['dist'].lstrip('0') assert district.isdigit(), "Invalid district found: {}".format(district) name = row['first name'] mid = row['middle initial'].strip() if mid: name += " %s" % mid name += " %s" % row['last name'] suffix = row['suffix'].strip() if suffix: name += " %s" % suffix party = row['party'] if party == 'Democrat': party = 'Democratic' leg = Legislator(term, chamber, district, name, party=party, url=row['URL']) office_address = "%s\nRoom %s\nHartford, CT 06106" % ( row['capitol street address'], row['room number']) email = row['email'].strip() if "@" not in email: assert email.endswith("mailform.php"), "Problematic email found: {}".format(email) email = None leg.add_office('capitol', 'Capitol Office', address=office_address, phone=row['capitol phone'], fax=(row['fax'].strip() or None), email=email) home_address = "{}\n{}, {} {}".format( row['home street address'], row['home city'], row['home state'], row['home zip code'], ) if "Legislative Office Building" not in home_address: leg.add_office('district', 'District Office', address=home_address, phone=row['home phone'] if row['home phone'].strip() else None) leg.add_source(leg_url) for comm in row['committee member1'].split(';'): if comm: if ' (' in comm: comm, role = comm.split(' (') role = role.strip(')').lower() else: role = 'member' comm = comm.strip() if comm == '': continue leg.add_role(role, term, chamber='joint', committee=comm) self.save_legislator(leg)
def scrape_legislator(self, name, chamber, term, url):
    page = self.get(url).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)

    party = page.xpath("string(//span[contains(@id, 'Party')])")
    party = party.strip()
    if party == 'Democrat':
        party = 'Democratic'

    district = page.xpath("string(//span[contains(@id, 'District')])")
    district = district.strip().lstrip('0')

    occupation = page.xpath("string(//span[contains(@id, 'Occupation')])")
    occupation = occupation.strip()

    (photo_url, ) = page.xpath('//img[contains(@id, "_imgMember")]/@src')

    office_phone = page.xpath(
        "string(//span[contains(@id, 'CapitolPhone')])").strip()

    email = None
    email_link = page.xpath('//a[@id="lnkMail"]')
    if email_link:
        email = email_link[0].attrib['href'].split(":")[1]

    legislator = Legislator(term, chamber, district, name,
                            party=party,
                            occupation=occupation,
                            photo_url=photo_url,
                            url=url)
    kwargs = {}
    if office_phone.strip() != "":
        kwargs['phone'] = office_phone

    if email and email.strip() != "":
        # South Dakota protects their email addresses from scraping using
        # some JS code that runs on page load. Until that code is run, all
        # their email addresses are listed as *@example.com; so, fix this
        kwargs['email'] = re.sub(r'@example\.com$',
                                 '@sdlegislature.gov', email)

    if kwargs:
        legislator.add_office('capitol', 'Capitol Office', **kwargs)

    home_address = [
        x.strip() for x in
        page.xpath('//td/span[contains(@id, "HomeAddress")]/text()')
        if x.strip()
    ]
    if home_address:
        home_address = "\n".join(home_address)
        home_phone = page.xpath(
            "string(//span[contains(@id, 'HomePhone')])").strip()
        legislator.add_office(
            'district', 'District Office',
            address=home_address, phone=home_phone or None)

    legislator.add_source(url)

    comm_url = page.xpath("//a[. = 'Committees']")[0].attrib['href']
    self.scrape_committees(legislator, comm_url)

    self.save_legislator(legislator)
def scrape_senators(self, chamber, term): session = ((int(term[0:4]) - 2009) / 2) + 124 mapping = { 'district': 0, 'first_name': 2, 'middle_name': 3, 'last_name': 4, 'suffixes': 5, 'party': 1, 'street_addr': 6, 'city': 7, 'state': 8, 'zip_code': 9, 'phone1': 10, 'phone2': 11, 'email': 12 } list_location = '2014/12/127th-Senate-Members2' url = ('http://legisweb1.mainelegislature.org/wp/senate/' 'wp-content/uploads/sites/2/{}.xlsx'.format(list_location)) fn, result = self.urlretrieve(url) wb = xlrd.open_workbook(fn) sh = wb.sheet_by_index(0) for rownum in xrange(1, sh.nrows): # get fields out of mapping d = {} for field, col_num in mapping.iteritems(): try: d[field] = str(sh.cell(rownum, col_num).value).strip() except IndexError: # This col_num doesn't exist in the sheet. pass full_name = " ".join( (d['first_name'], d['middle_name'], d['last_name'])) full_name = re.sub(r'\s+', ' ', full_name).strip() address = "{street_addr}\n{city}, ME {zip_code}".format(**d) # For matching up legs with votes district_name = d['city'] phone = d['phone1'] if not phone: phone = d['phone2'] if not phone: phone = None district = d['district'].split('.')[0] # Determine legislator's URL to get their photo LEGISLATOR_ROSTER_URL = \ 'http://legisweb1.mainelegislature.org/wp/senate/senators/' html = self.get(LEGISLATOR_ROSTER_URL).text doc = lxml.html.fromstring(html) doc.make_links_absolute(LEGISLATOR_ROSTER_URL) URL_XPATH = '//address[contains(text(), "(District {})")]/a/@href'. \ format(district) (leg_url, ) = doc.xpath(URL_XPATH) leg = Legislator(term, chamber, district, full_name, first_name=d['first_name'], middle_name=d['middle_name'], last_name=d['last_name'], party=d['party'], suffixes=d['suffixes'], district_name=district_name, url=leg_url) leg.add_source(url) leg.add_source(leg_url) html = self.get(leg_url).text doc = lxml.html.fromstring(html) doc.make_links_absolute(leg_url) xpath = '//img[contains(@src, ".png")]/@src' photo_url = doc.xpath(xpath) if photo_url: photo_url = photo_url.pop() leg['photo_url'] = photo_url else: photo_url = None office = dict(name='District Office', type='district', phone=phone, fax=None, email=d['email'], address=address) leg['email'] = d['email'] leg.add_office(**office) self.save_legislator(leg)
def scrape(self, chamber, term): url = self.URLs[chamber] page = self.lxmlize(url) for block in page.xpath("//div[@class='ms-rtestate-field']")[1:-1]: # Each legislator block. photo_block = block.xpath("ancestor::td/preceding-sibling::td") if len(photo_block) == 0: continue photo_block, = photo_block # (The <td> before ours was the photo) img, = photo_block.xpath("*") img = img.attrib['src'] h2, = block.xpath(".//h2/a") name = h2.text info = {} # Right, now let's get info out of their little profile box. for entry in block.xpath(".//p"): for kvpair in itergraphs(entry.xpath("./*"), 'br'): # OK. We either get the tail or the next element # (usually an <a> tag) if len(kvpair) == 1: key, = kvpair value = key.tail.strip() if key.tail else None if value: value = re.sub("\s+", " ", value).strip() elif len(kvpair) == 2: key, value = kvpair else: # Never seen text + an <a> tag, perhaps this can happen. raise ValueError( "Too many elements. Something changed") key = key.text_content().strip(" :") if value is None: # A page has the value in a <strong> tag. D'oh. key, value = (x.strip() for x in key.rsplit(":", 1)) key = re.sub("\s+", " ", key).strip() info[key] = value info['District'] = info['District'].encode('ascii', 'ignore').strip() info['Party'] = info['Party'].strip(": ") leg = Legislator(term=term, url=h2.attrib['href'], chamber=chamber, full_name=name, party=info['Party'], district=info['District'], photo_url=img) leg.add_source(url) phone = info.get('Capitol Phone', info.get('apitol Phone')) if hasattr(phone, 'text_content'): phone = phone.text_content() leg.add_office(type='capitol', name='Capitol Office', address=info['Capitol Address'], phone=phone, email=info['Email'].attrib['href'].replace( "mailto:", "")) self.save_legislator(leg)
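# itergraphs() is used above but defined elsewhere. From its call site it
# appears to group sibling elements, yielding one tuple per run and
# splitting whenever the break tag (here 'br') is seen. A sketch under
# that assumption:
def itergraphs(elements, break_tag):
    buf = []
    for element in elements:
        if element.tag == break_tag:
            if buf:
                yield tuple(buf)
            buf = []
        else:
            buf.append(element)
    if buf:
        yield tuple(buf)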
def scrape(self, chamber, term): if chamber == 'upper': index_url = 'http://www.leg.wa.gov/senate/senators/Pages/default.aspx' else: index_url = 'http://www.leg.wa.gov/house/representatives/Pages/default.aspx' doc = self.lxmlize(index_url) # Email addresses are listed on a separate page. email_list_url = 'http://app.leg.wa.gov/memberemail/Default.aspx' email_doc = self.lxmlize(email_list_url) for member in doc.xpath( '//div[@id="allMembers"]/div[@class="memberInformation"]'): (photo_url, ) = member.xpath('.//a[text()="Print Quality Photo"]/@href') (title_name_party, ) = member.xpath('.//span[@class="memberName"]/text()') (name, party) = re.search( r'^(?:Senator|Representative)\s(.+)\s\(([RD])\)$', title_name_party).groups() if party == 'R': party = "Republican" elif party == 'D': party = "Democratic" ( district_name, _district_name, ) = member.xpath( './/a[contains(text(), " Legislative District")]/text()') assert district_name == _district_name district_num = re.search(r'(\d{1,2})\w{2} Legislative District', district_name).group(1) leg = Legislator(full_name=name, term=term, chamber=chamber, district=district_num, party=party, photo_url=photo_url) leg['url'] = member.xpath( './/a[contains(text(), "Home Page")]/@href')[0] capitol_office = member.xpath( './/div[@class="memberColumnTitle" and text()=" Olympia Office"]/parent::div[1]/text()' ) capitol_office = [l.strip() for l in capitol_office if l.strip()] capitol_fax = None capitol_phone = None capitol_address = None # Can't capture any information anyway if office data is empty, # so we can skip if that's the case. if capitol_office: # Retrieve capitol office fax number. if capitol_office[-1].startswith('Fax: '): capitol_fax = capitol_office.pop().replace('Fax: ', "") # Retrieve capitol office phone number. capitol_phone = capitol_office.pop() # Retrieve capitol office address. capitol_address = '\n'.join(capitol_office) # Retrieve the member's position from the email link. We need it to find the member's email address. 
        # These positions are enough to discriminate the chamber too
        # (0 = upper; 1, 2 = lower).
        email_link_url = member.xpath(
            './/a[contains(@href, "memberEmail")]')[0].get('href')
        position = re.search(r'/([0-9]+)$', email_link_url).group(1)

        # Get the email from the email page by matching the member's
        # district and position.
        email = self.get_node(
            email_doc,
            './/tr/td/a[contains(@href, "memberEmail/{}/{}")]/parent::td/'
            'following-sibling::td[1]/text()'.format(
                district_num, position)).strip()

        leg.add_office('capitol', 'Capitol Office',
                       address=capitol_address,
                       phone=capitol_phone,
                       email=email,
                       fax=capitol_fax)

        _has_district_office = member.xpath(
            './/div[@class="memberColumnTitle" and text()=" District Office"]')
        if _has_district_office:
            # Of both chambers, only one member has multiple district
            # offices, so ignore that. Also ignore the few members who
            # have separate mailing addresses.
            district_office = member.xpath(
                './/div[@class="memberColumnTitle" and'
                ' text()=" District Office"]/parent::div[1]/text()')
            district_office = [l.strip() for l in district_office if l.strip()]
            _end_of_first_address = district_office.index(
                [l for l in district_office
                 if re.search(r'\,\s*WA\s*\d{5}', l)][0])
            district_address = '\n'.join(
                district_office[0:(_end_of_first_address + 1)])

            # Guard against a missing or malformed phone line so that
            # district_phone is always defined before it is used below.
            district_phone = None
            if len(district_office) > _end_of_first_address + 1:
                candidate = district_office[_end_of_first_address + 1]
                if re.match(r'\(\d{3}\) \d{3} \- \d{4}', candidate):
                    district_phone = candidate

            leg.add_office('district', 'District Office',
                           address=district_address,
                           phone=district_phone)

        leg.add_source(index_url)
        self.save_legislator(leg)
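# Quick check of the fixed position regex above; the URL tail shape
# (".../memberEmail/<district>/<position>") is taken from the xpath format
# string, and the concrete values are illustrative:
# >>> re.search(r'/([0-9]+)$', 'http://app.leg.wa.gov/memberEmail/32/1').group(1)
# '1'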
def scrape(self, term, chambers): url = 'http://gencourt.state.nh.us/downloads/Members.txt' option_map = {} html = self.urlopen('http://www.gencourt.state.nh.us/house/members/memberlookup.aspx') doc = lxml.html.fromstring(html) for opt in doc.xpath('//option'): option_map[opt.text] = opt.get('value') data = self.urlopen(url) for line in data.splitlines(): if line.strip() == "": continue (chamber, fullname, last, first, middle, county, district_num, seat, party, street, street2, city, astate, zipcode, home_phone, office_phone, fax, email, com1, com2, com3, com4, com5, com6, com7) = line.split('*') chamber = chamber_map[chamber] # skip legislators from a chamber we aren't scraping if chamber not in chambers: continue if middle: full = '%s %s %s' % (first, middle, last) else: full = '%s %s' % (first, last) address = street if street2: address += (' ' + street2) address += '\n%s, %s %s' % (city, astate, zipcode) district = str(int(district_num)) if county: district = '%s %s' % (county, district) leg = Legislator(term, chamber, district, full, first, last, middle, party_map[party], email=email) leg.add_office('district', 'Home Address', address=address, phone=home_phone or None) leg.add_office('district', 'Office Address', phone=office_phone or None, fax=fax or None) if chamber == 'upper': leg['url'] = 'http://www.gencourt.state.nh.us/Senate/members/webpages/district%02d.aspx' % int(district_num) elif chamber == 'lower': code = option_map.get('{0}, {1}'.format(last, first)) if code: leg['url'] = 'http://www.gencourt.state.nh.us/house/members/member.aspx?member=' + code romans = r'(?i)\s([IXV]+)(?:\s|$)' for com in (com1, com2, com3, com4, com5, com6, com7): com = com.strip('"') if com: com_name = com.title() com_name = re.sub(romans, lambda m: m.group().upper(), com_name) leg.add_role('committee member', term=term, chamber=chamber, committee=com_name) if 'url' in leg: leg['photo_url'] = self.get_photo(leg['url'], chamber) leg.add_source(url) self.save_legislator(leg)
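# chamber_map and party_map are module-level lookups not shown in this
# section. Plausible definitions, inferred from how the Members.txt
# columns are used above (the exact party codes in the file are an
# assumption; other codes may exist):
chamber_map = {'H': 'lower', 'S': 'upper'}
party_map = {'D': 'Democratic', 'R': 'Republican'}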
def scrape_member(self, chamber, term, member_url):
    page = self.get(member_url).text
    root = lxml.html.fromstring(page)
    root.make_links_absolute(member_url)

    photo_url = root.xpath('//div[@class="thumbPhoto"]/img/@src')[0]
    full_name = root.xpath('//h1/span')[0].tail.strip()

    try:
        email = root.xpath('//a[contains(@href, "mailto")]/@href')[0]
        email = email.replace('mailto:', '')
    except IndexError:
        email = ''
        self.info("seat may be vacant")

    party, district = root.xpath('//h1/span')[1].text.split('-')
    party = party.strip()
    district = clean_district(district.strip())

    if party in ('D', 'Democrat', 'Democratic'):
        party = 'Democratic'
    elif party in ('R', 'Republican'):
        party = 'Republican'
    else:
        party = 'Other'

    leg = Legislator(term, chamber, district, full_name, party=party,
                     photo_url=photo_url, url=member_url)
    leg.add_source(member_url)

    # offices

    # This flag ensures the email is attached to only one office,
    # and that at least one office is created to hold it.
    email_stored = True
    if email:
        email_stored = False

    for addr in root.xpath('//address/div[@class="contactGroup"]'):
        office_name = addr.xpath(
            '../preceding-sibling::h4/text()')[0].strip()
        address = addr.xpath('a')[0].text_content()
        address = re.sub(r'\s{2,}', '\n', address)

        phone = fax = None
        expecting = None  # avoid shadowing the next() builtin
        for phonerow in addr.xpath('./div/div'):
            phonerow = phonerow.text_content().strip()
            if phonerow == 'Phone:':
                expecting = 'phone'
            elif phonerow == 'Fax:':
                expecting = 'fax'
            elif expecting == 'phone':
                phone = phonerow
                expecting = None
            elif expecting == 'fax':
                fax = phonerow
                expecting = None
            else:
                self.warning('unknown phonerow %s', phonerow)

        # all pieces collected
        if 'District' in office_name:
            otype = 'district'
        else:
            # Default to capitol ("State House") so otype is always bound.
            otype = 'capitol'

        if not email_stored:
            email_stored = True
            leg.add_office(otype, office_name, phone=phone, fax=fax,
                           address=address, email=email)
        else:
            leg.add_office(otype, office_name, phone=phone, fax=fax,
                           address=address)

    if not email_stored:
        leg.add_office('capitol', 'Capitol Office', email=email)

    self.save_legislator(leg)
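# clean_district() is called above and in scrape_member() further below,
# but isn't defined in this section. MA districts read like
# "37th Middlesex", so a minimal sketch would just normalize whitespace
# and drop a trailing "District" label (purely an assumption about the
# real helper):
def clean_district(district):
    district = re.sub(r'\s+', ' ', district).strip()
    return re.sub(r'\s*district$', '', district, flags=re.IGNORECASE)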
def scrape(self, chamber, term): # What Vermont claims are Word and Excel files are actually # just HTML tables # What Vermont claims is a CSV file is actually one row of comma # separated values followed by a ColdFusion error. url = ("http://www.leg.state.vt.us/legdir/" "memberdata.cfm/memberdata.doc?FileType=W") page = self.urlopen(url) page = lxml.html.fromstring(page) for tr in page.xpath("//tr")[1:]: row_chamber = tr.xpath("string(td[4])") if row_chamber == 'S' and chamber == 'lower': continue elif row_chamber == 'H' and chamber == 'upper': continue district = tr.xpath("string(td[7])") district = district.replace('District', '').strip() if not district: continue first_name = tr.xpath("string(td[8])") middle_name = tr.xpath("string(td[9])") last_name = tr.xpath("string(td[10])") if first_name.endswith(" %s." % middle_name): first_name = first_name.split(" %s." % middle_name)[0] if middle_name: full_name = "%s %s. %s" % (first_name, middle_name, last_name) else: full_name = "%s %s" % (first_name, last_name) email = tr.xpath("string(td[11])") party = tr.xpath("string(td[6])") party = re.sub(r'Democrat\b', 'Democratic', party) parties = party.split('/') if 'Republican' in parties: if 'Democratic' in parties: pass else: party = 'Republican' parties.remove('Republican') elif 'Democratic' in parties: party = 'Democratic' parties.remove('Democratic') else: party = parties.pop(0) leg = Legislator( term, chamber, district, full_name, first_name=first_name, middle_name=middle_name, last_name=last_name, party=party, email=email, # closest thing we have to a page for legislators, not ideal url='http://www.leg.state.vt.us/legdir/LegDirMain.cfm') leg['roles'][0]['other_parties'] = parties leg.add_source(url) # 12-16: MailingAddress: 1,2,City,State,ZIP mail = '%s\n%s\n%s, %s %s' % ( tr.xpath('string(td[12])'), tr.xpath('string(td[13])'), tr.xpath('string(td[14])'), tr.xpath('string(td[15])'), tr.xpath('string(td[16])')) leg.add_office('district', 'Mailing Address', address=mail) # 17-21: HomeAddress: 1,2,City,State,ZIP, Email, Phone home = '%s\n%s\n%s, %s %s' % ( tr.xpath('string(td[17])'), tr.xpath('string(td[18])'), tr.xpath('string(td[19])'), tr.xpath('string(td[20])'), tr.xpath('string(td[21])')) home_email = tr.xpath('string(td[22])') or None home_phone = tr.xpath('string(td[23])') or None leg.add_office('district', 'Home Address', address=home, email=home_email, phone=home_phone) self.save_legislator(leg)
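# Worked example of the multi-party branch above, with hypothetical input:
# party == 'Republican/Democratic' splits into ['Republican', 'Democratic'];
# 'Republican' is kept as the primary party, and the leftover
# ['Democratic'] ends up in leg['roles'][0]['other_parties'].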
def fetch_member(self, url, name, term, chamber):
    if name in CHAMBER_MOVES:
        if chamber != CHAMBER_MOVES[name]:
            return  # Skip bad chambers.

    party_map = {'R': 'Republican', 'D': 'Democratic', 'I': 'Independent'}
    party_district_re = re.compile(
        r'\((R|D|I)\) - (?:House|Senate) District\s+(\d+)')

    # Handle resignations and special elections.
    match = re.search(r'-(Resigned|Member) (\d{1,2}/\d{1,2})?', name)
    if match:
        action, date = match.groups()
        name = name.rsplit('-')[0]
        if action == 'Resigned':
            pass  # TODO: set end date
        elif action == 'Member':
            pass  # TODO: set start date

    html = self.urlopen(url)
    doc = lxml.html.fromstring(html)

    party_district_line = doc.xpath('//h3/font/text()')[0]
    party, district = party_district_re.match(party_district_line).groups()

    leg = Legislator(term, chamber, district, name.strip(),
                     party=party_map[party], url=url)
    leg.add_source(url)

    for ul in doc.xpath('//ul[@class="linkNon"]'):
        address = []
        phone = None
        email = None
        for li in ul.getchildren():
            text = li.text_content()
            if re.match(r'\(\d{3}\)', text):
                phone = text
            elif text.startswith('email:'):
                # str.strip() takes a character *set*, not a prefix, so
                # use replace() to drop the label without eating the
                # address itself.
                email = text.replace('email:', '').strip()
            else:
                address.append(text)
        # Check the joined address text for the Capitol marker; a list
        # membership test would only match an exact line.
        office_type = ('capitol' if 'Capitol Square' in '\n'.join(address)
                       else 'district')
        office_name = ('Capitol Office' if office_type == 'capitol'
                       else 'District Office')
        leg.add_office(office_type, office_name,
                       address='\n'.join(address),
                       phone=phone, email=email)

    for com in doc.xpath('//ul[@class="linkSect"][1]/li/a/text()'):
        leg.add_role('committee member', term=term, chamber=chamber,
                     committee=com)

    self.save_legislator(leg)
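# CHAMBER_MOVES is referenced above but defined elsewhere; it presumably
# maps members who changed chambers mid-term to the chamber their page
# should be scraped under. Hypothetical shape:
CHAMBER_MOVES = {
    # 'Firstname Lastname': 'upper',
}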
def scrape(self, chamber, term):
    chamber_abbrev = {'upper': 'S', 'lower': 'H'}[chamber]
    url = ("http://legisweb.state.wy.us/LegislatorSummary/LegislatorList"
           ".aspx?strHouse=%s&strStatus=N" % chamber_abbrev)
    page = lxml.html.fromstring(self.urlopen(url))
    page.make_links_absolute(url)

    for link in page.xpath("//a[contains(@href, 'LegDetail')]"):
        name = link.text.strip()
        leg_url = link.get('href')
        email_address = link.xpath("../../../td[2]//a")[0].attrib['href']
        email_address = email_address.split('Mailto:')[1]
        party = link.xpath("string(../../../td[3])").strip()
        if party == 'D':
            party = 'Democratic'
        elif party == 'R':
            party = 'Republican'
        district = link.xpath("string(../../../td[4])").strip().lstrip('HS0')

        leg_page = lxml.html.fromstring(self.urlopen(leg_url))
        leg_page.make_links_absolute(leg_url)
        img = leg_page.xpath(
            "//img[contains(@src, 'LegislatorSummary/photos')]")[0]
        photo_url = img.attrib['src']

        office_tds = leg_page.xpath(
            '//table[@id="ctl00_cphContent_tblContact"]/tr/td/text()')
        address = []
        phone = None
        cell = None
        fax = None
        for td in office_tds:
            # str.strip() takes a character set, so use replace() to drop
            # the "Home -"/"Cell -"/"Fax -" labels without mangling digits.
            if td.startswith('Home -'):
                phone = td.replace('Home -', '').strip()
            elif td.startswith('Cell -'):
                cell = td.replace('Cell -', '').strip()
                if not phone:
                    phone = cell
            elif td.startswith('Fax -'):
                fax = td.replace('Fax -', '').strip()
            elif ' - ' not in td:
                address.append(td)

        leg = Legislator(term, chamber, district, name, party=party,
                         email=email_address, photo_url=photo_url,
                         url=leg_url)
        adr = " ".join(address)
        if adr.strip() != "":
            leg.add_office('district', 'Contact Information', cell=cell,
                           address=adr, phone=phone, fax=fax)
        leg.add_source(url)
        leg.add_source(leg_url)
        self.save_legislator(leg)
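# Why the replace() calls above matter: str.strip() removes any characters
# drawn from the given *set*, not a literal prefix/suffix, so label text
# can silently eat real data. Compare (illustrative values):
# >>> 'email: emily@example.com'.strip('email: ')
# '@example.co'
# >>> 'email: emily@example.com'.replace('email:', '').strip()
# 'emily@example.com'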
def scrape(self, chamber, term): self.validate_term(term) session = self.get_session_for_term(term) try: session_id = self.get_session_id(session) except KeyError: raise NoDataForPeriod(session) body = {'lower': 'H', 'upper': 'S'}[chamber] url = 'http://www.azleg.gov/MemberRoster.asp?Session_ID=%s&body=%s' % ( session_id, body) with self.urlopen(url) as page: root = html.fromstring(page) path = '//table[@id="%s"]/tr' % {'H': 'house', 'S': 'senate'}[body] roster = root.xpath(path)[1:] for row in roster: position = '' vacated = '' name, district, party, email, room, phone, fax = row.xpath( 'td') if email.attrib.get('class') == 'vacantmember': continue # Skip any vacant members. link = name.xpath('string(a/@href)') link = "http://www.azleg.gov" + link if len(name) == 1: name = name.text_content().strip() else: position = name.tail.strip() name = name[0].text_content().strip() district = district.text_content() party = party.text_content().strip() email = email.text_content().strip() if ('Vacated' in email or 'Resigned' in email or 'Removed' in email): # comment out the following 'continue' for historical # legislative sessions # for the current session, if a legislator has left we will # skip him/her to keep from overwriting their information continue vacated = re.search('[0-9]*/[0-9]*/\d{4}', email).group() email = '' party = self.get_party(party) room = room.text_content().strip() if chamber == 'lower': address = "House of Representatives\n" else: address = "Senate\n" address = address + "1700 West Washington\n Room " + room \ + "\nPhoenix, AZ 85007" phone = phone.text_content().strip() if not phone.startswith('602'): phone = "602-" + phone fax = fax.text_content().strip() if not fax.startswith('602'): fax = "602-" + fax if vacated: end_date = datetime.datetime.strptime(vacated, '%m/%d/%Y') leg = Legislator(term, chamber, district, full_name=name, party=party, url=link) leg['roles'][0]['end_date'] = end_date else: leg = Legislator(term, chamber, district, full_name=name, party=party, email=email, url=link) leg.add_office('capitol', 'Capitol Office', address=address, phone=phone, fax=fax) if position: leg.add_role(position, term, chamber=chamber, district=district, party=party) leg.add_source(url) #Probably just get this from the committee scraper #self.scrape_member_page(link, session, chamber, leg) self.save_legislator(leg)
def scrape(self, chamber, term):
    biennium = "%s-%s" % (term[0:4], term[7:9])
    url = ("http://wslwebservices.leg.wa.gov/SponsorService.asmx/"
           "GetSponsors?biennium=%s" % biennium)

    # These pages are useful for checking if a leg is still in office.
    if chamber == 'upper':
        cur_members = self.urlopen(
            'http://www.leg.wa.gov/senate/senators/Pages/default.aspx')
    else:
        cur_members = self.urlopen(
            'http://www.leg.wa.gov/house/representatives/Pages/default.aspx')

    page = self.urlopen(url)
    page = lxml.etree.fromstring(page.bytes)

    for member in xpath(page, "//wa:Member"):
        mchamber = xpath(member, "string(wa:Agency)")
        mchamber = {'House': 'lower', 'Senate': 'upper'}[mchamber]
        if mchamber != chamber:
            continue

        name = xpath(member, "string(wa:Name)").strip()

        # If the legislator isn't in the listing, skip them.
        if name not in cur_members:
            self.warning('%s is no longer in office' % name)
            continue

        party = xpath(member, "string(wa:Party)")
        party = {'R': 'Republican', 'D': 'Democratic'}.get(party, party)

        district = xpath(member, "string(wa:District)")
        if district == '0':
            # Skip phony district 0.
            continue

        email = xpath(member, "string(wa:Email)")
        leg_id = xpath(member, "string(wa:Id)")
        phone = xpath(member, "string(wa:Phone)")

        last = xpath(member, "string(wa:LastName)")
        last = last.lower().replace(' ', '')

        if chamber == 'upper':
            leg_url = ("http://www.leg.wa.gov/senate/senators/"
                       "Pages/%s.aspx" % last)
        else:
            leg_url = ("http://www.leg.wa.gov/house/"
                       "representatives/Pages/%s.aspx" % last)

        # Initialize so a page without a photo link can't leave this unbound.
        photo_url = ''
        scraped_offices = []

        try:
            leg_page = self.urlopen(leg_url)
            leg_page = lxml.html.fromstring(leg_page)
            leg_page.make_links_absolute(leg_url)

            photo_link = leg_page.xpath(
                "//a[contains(@href, 'publishingimages')]")
            if photo_link:
                photo_url = photo_link[0].attrib['href']

            offices = leg_page.xpath(
                "//table[@cellspacing='0']/tr/td/b[contains(text(), 'Office')]")
            for office in offices:
                office_block = office.getparent()
                office_name = office.text_content().strip().rstrip(":")
                address_lines = [x.tail
                                 for x in office_block.xpath(".//br")]
                address_lines = filter(lambda a: a is not None,
                                       address_lines)
                office_phone = address_lines.pop(len(address_lines) - 1)
                address = "\n".join(address_lines)
                obj = {"name": office_name, "phone": office_phone}
                if address.strip() != '':
                    obj['address'] = address
                scraped_offices.append(obj)
        except scrapelib.HTTPError:
            # Sometimes the API and website are out of sync with respect
            # to legislator resignations/appointments.
            pass

        leg = Legislator(term, chamber, district, name, '', '', '', party,
                         _code=leg_id, photo_url=photo_url, url=leg_url)
        leg.add_source(leg_url)

        for office in scraped_offices:
            typ = 'district' if 'District' in office['name'] else 'capitol'
            leg.add_office(typ, office.pop('name'), **office)

        self.save_legislator(leg)
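# The bare xpath() helper used above is defined elsewhere; it presumably
# binds the 'wa' prefix for the legislature's web-service XML. A sketch
# (the namespace URI is an assumption):
WA_NS = {'wa': 'http://WSLWebServices.leg.wa.gov/'}

def xpath(elem, expr):
    return elem.xpath(expr, namespaces=WA_NS)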
def _scrape_individual_legislator_page(self, url, term, chamber, district=None): """Scrape a specific lower house legislators page. The function will actually call one of three functions as there is 2 different bio templates and a completely separate one for the speaker of the house. Example url: http://www1.legis.ga.gov/legis/2009_10/house/bios/abdulsalaamRoberta/abdulsalaamRoberta.htm """ if 'speaker/index.htm' in url: return self._scrape_speaker_of_the_house(url, term, chamber) with self.lxml_context(url) as page: # page == None == 404 if page is None: return None page.make_links_absolute(url) # first check to see if this is the 'original' template or the new one stylesheet_path = '//link[@rel="stylesheet"]' stylesheets = page.xpath(stylesheet_path) for style_sheet in stylesheets: if 'legis.ga.gov.house.factsheet.css' in style_sheet.get('href') or \ 'legis.ga.gov.house.bio.css' in style_sheet.get('href'): return self._scrape_individual_legislator_page_second_template(page, term, chamber, district=district) path = '//table[@id="hoverTable"]/tr' legislator_info = page.xpath(path) # There is one page, "www1.legis.ga.gov/legis/2011_12/house/bios/williamsCoach.htm" that has # malformed HTML, going to manually do that one: if "www1.legis.ga.gov/legis/2011_12/house/bios/williamsCoach.htm" in url: legislator = Legislator(term, chamber, district, '"Coach" Williams', party="Democratic", url=url) legislator.add_source(url) return legislator # See if we got to the first row, some templates don't start with their table as 'hoverTable' # in this case let's just get the first table on the page as that is seeming to work well. if not legislator_info: path = '//table' tables = page.xpath(path) legislator_info = tables[0].getchildren() first_row = legislator_info[0] td_elements = first_row.getchildren()[0] name = td_elements[0].text_content().split('\n')[0].strip() party = td_elements[1].text_content().strip()[0:1].upper() # There was some cases where the party wasn't in a <p> it was after the # <h2>name</h2> foo <br />, seriously wtf if party not in self.PARTY_DICT: elements = td_elements.text_content().split('\n') for ele in elements: ele = ele.strip() if " - " in ele: party = ele[0:1] break elif ele.upper() == 'REPUBLICAN': party = 'R' break elif ele.upper() == 'DEMOCRAT': party = 'D' break if party == '': party = td_elements.text_content().split('\n')[1].strip()[0:1] if not district: if len(td_elements) < 3 or "District" not in td_elements[2].text_content(): text_content = first_row[1].text_content().split('\n') district = text_content[0].strip()[len("District "):] else: district = td_elements[2].text_content().strip()[len("District "):] # Not every legislator has a sworn in date or facebook url, so attempt to parse # and just pass if it fails sworn_in = None try: sworn_in = td_elements[4].text_content().strip()[len("Sworn in "):] except: pass facebook_url = '' try: facebook_url = td_elements[5].get('href') except: pass photo_url = '' try: td_elements = first_row.getchildren()[1] photo_url = td_elements[0].getchildren()[0].get('src') or '' except: pass # Second row: second_row = legislator_info[1] address_info = second_row.getchildren()[0].text_content().split("<br />")[0].split("\n") phone_number = address_info.pop() address = " ".join(address_info) email = '' try: text_content = second_row.text_content().split('\n') for content in text_content: if '@' in content.strip(): email = content.strip() except IndexError: try: email = second_row.getchildren()[1].getchildren()[0].text_content() except: pass 
        legislator = Legislator(term, chamber, district, name,
                                party=self.PARTY_DICT[party], email=email,
                                photo_url=photo_url,
                                facebook_url=facebook_url,
                                sworn_in_date=sworn_in, url=url)
        # Pass the joined address string, not the raw list of lines.
        legislator.add_office('capitol', 'Capitol Address',
                              address=address, phone=phone_number)
        legislator.add_source(url)
        return legislator
def scrape(self, chamber, term):
    self.validate_term(term, latest_only=False)
    root_url = 'http://www.capitol.tn.gov/'
    parties = {'D': 'Democratic', 'R': 'Republican',
               'CCR': 'Carter County Republican', 'I': 'Independent'}

    # Pick the chamber-specific URL.
    if chamber == 'upper':
        url_chamber_name = 'senate'
        abbr = 's'
    else:
        url_chamber_name = 'house'
        abbr = 'h'
    if term != self.metadata["terms"][-1]["sessions"][0]:
        chamber_url = root_url + url_chamber_name
        chamber_url += '/archives/' + term + 'GA/Members/index.html'
    else:
        chamber_url = root_url + url_chamber_name + '/members/'

    page = self.get(chamber_url).text
    page = lxml.html.fromstring(page)

    for row in page.xpath("//tr"):
        # Skip any header row.
        if set(child.tag for child in row) == set(['th']):
            continue

        vacancy_check = row.xpath('./td/text()')[1]
        if 'Vacant' in vacancy_check:
            self.logger.warning("Vacant Seat")
            continue

        partyInit = row.xpath('td[3]')[0].text.split()[0]
        party = parties[partyInit]
        district = row.xpath('td[5]/a')[0].text.split()[1]
        address = row.xpath('td[6]')[0].text_content()
        # Expand the "LP"/"WMB" building codes into full addresses at
        # 301 6th Avenue North.
        address = address.replace(
            'LP', 'Legislative Plaza\nNashville, TN 37243')
        address = address.replace(
            'WMB', 'War Memorial Building\nNashville, TN 37243')
        address = '301 6th Avenue North\nSuite ' + address
        phone = [x.strip()
                 for x in row.xpath('td[7]//text()') if x.strip()][0]
        email = HTMLParser.HTMLParser().unescape(
            row.xpath('td[1]/a/@href')[0][len("mailto:"):])
        member_url = (root_url + url_chamber_name + '/members/' + abbr +
                      district + '.html')
        member_photo_url = (root_url + url_chamber_name +
                            '/members/images/' + abbr + district + '.jpg')

        try:
            member_page = self.get(member_url,
                                   follow_redirects=False).text
        except TypeError:
            try:
                member_page = self.get(member_url).text
            except HTTPError:
                self.logger.warning("page doesn't exist")
                continue
        member_page = lxml.html.fromstring(member_page)
        try:
            name = member_page.xpath('body/div/div/h1/text()')[0]
        except IndexError:
            name = member_page.xpath(
                '//div[@id="membertitle"]/h2/text()')[0]

        # Strip the title prefix from the displayed name.
        if 'Speaker' in name:
            full_name = name[len("Speaker "):]
        elif 'Lt.' in name:
            full_name = name[len("Lt. Governor "):]
        elif abbr == 'h':
            full_name = name[len("Representative "):]
        else:
            full_name = name[len("Senator "):]

        leg = Legislator(term, chamber, district, full_name.strip(),
                         party=party, url=member_url,
                         photo_url=member_photo_url)
        leg.add_source(chamber_url)
        leg.add_source(member_url)
        # TODO: add district address from this page
        leg.add_office('capitol', 'Nashville Address', address=address,
                       phone=phone, email=email)
        self.save_legislator(leg)
def scrape_reps(self, chamber, term_name): url = 'http://www.maine.gov/legis/house/dist_mem.htm' page = self.get(url).text page = lxml.html.fromstring(page) page.make_links_absolute(url) # These do not include the non-voting tribal representatives # They do not have numbered districts, and lack a good deal of # the standard profile information about representatives districts = [ x for x in page.xpath('/html/body/p') if len(x.xpath('a')) == 3 ] for district in districts: if "- Vacant" in district.text_content(): self.warning("District is vacant: '{}'".format( district.text_content())) continue district_number = district.xpath('a[1]/@name')[0] leg_url = district.xpath('a[3]/@href')[0] leg_info = district.xpath('a[3]/text()')[0] INFO_RE = r''' Representative\s (?P<member_name>.+?) \s\( (?P<party>[DRUI]) - (?P<district_name>.+?) \) ''' info_search = re.search(INFO_RE, leg_info, re.VERBOSE) member_name = info_search.group('member_name') party = _party_map[info_search.group('party')] district_name = info_search.group('district_name') leg = Legislator(term_name, chamber, str(district_number), member_name, party=party, url=leg_url, district_name=district_name) leg.add_source(url) leg.add_source(leg_url) # Get the photo url. html = self.get(leg_url).text doc = lxml.html.fromstring(html) doc.make_links_absolute(leg_url) (photo_url, ) = doc.xpath('//img[contains(@src, ".jpg")]/@src') leg['photo_url'] = photo_url # Add contact information from personal page office_address = re.search(r'<B>Address: </B>(.+?)<P>', html, re.IGNORECASE).group(1) office_email = doc.xpath( '//a[starts-with(@href, "mailto:")]/text()') if office_email: office_email = office_email[0] else: office_email = None business_phone = re.search(r'<B>Business Telephone: </B>(.+?)<P>', html, re.IGNORECASE) home_phone = re.search(r'<B>Home Telephone: </B>(.+?)<P>', html, re.IGNORECASE) cell_phone = re.search(r'<B>Cell Telephone: </B>(.+?)<P>', html, re.IGNORECASE) if business_phone: office_phone = business_phone.group(1) elif home_phone: office_phone = home_phone.group(1) elif cell_phone: office_phone = cell_phone.group(1) else: office_phone = None district_office = { 'name': "District Office", 'type': "district", 'address': office_address, 'fax': None, 'email': office_email, 'phone': office_phone } leg.add_office(**district_office) # Add state party office to member's addresses if party == "Democratic": DEM_PARTY_OFFICE = dict(name='House Democratic Office', type='capitol', address='\n'.join([ 'Room 333, State House', '2 State House Station', 'Augusta, Maine 04333-0002' ]), fax=None, email=None, phone='(207) 287-1430') leg.add_office(**DEM_PARTY_OFFICE) elif party == "Republican": REP_PARTY_OFFICE = dict(name='House GOP Office', type='capitol', address='\n'.join([ 'Room 332, State House', '2 State House Station', 'Augusta, Maine 04333-0002' ]), fax=None, email=None, phone='(207) 287-1440') leg.add_office(**REP_PARTY_OFFICE) # Save legislator self.save_legislator(leg)
def scrape_legislator_page(self, term, url): page = self.urlopen(url) page = lxml.html.fromstring(page) page.make_links_absolute(url) name = page.xpath("//h1[@id='page-title']/text()")[0] district = page.xpath("//a[contains(@href, 'district')]/text()")[0] district = district.replace("District", "").strip() committees = page.xpath("//a[contains(@href, 'committees')]/text()") party = page.xpath("//div[contains(text(), 'Political Party')]" )[0].getnext().text_content().strip() photo = page.xpath("//div[@class='field-person-photo']/img/@src") photo = photo[0] if len(photo) else None address = page.xpath("//div[@class='adr']")[0] address = re.sub("\s+", " ", address.text_content()).strip() item_mapping = { "email": "email", "home telephone": "home-telephone", "cellphone": "cellphone", "office telephone": "office-telephone", "political party": "party", "chamber": "chamber", "fax": "fax" } metainf = {} for block in page.xpath( "//div[contains(@class, 'field-label-inline')]"): label, items = block.xpath("./*") key = label.text_content().strip().lower() if key.endswith(":"): key = key[:-1] metainf[item_mapping[key]] = items.text_content().strip() chamber = {"Senate": "upper", "House": "lower"}[metainf['chamber']] kwargs = { "party": { "Democrat": "Democratic", "Republican": "Republican" }[metainf['party']] } if photo: kwargs['photo_url'] = photo leg = Legislator(term, chamber, district, name, **kwargs) kwargs = {"address": address, "url": url} for key, leg_key in [ ('email', 'email'), ('home-telephone', 'home_phone'), ('cellphone', 'cellphone'), ('fax', 'fax'), ('office-telephone', 'office_phone'), ]: if key in metainf: kwargs[leg_key] = metainf[key] leg.add_office('district', 'District Office', **kwargs) #for committee in committees: # leg.add_role('committee member', # term=term, # chamber=chamber, # committee=committee) leg.add_source(url) self.save_legislator(leg)
def scrape_legislator(self, chamber, term, url):
    html = self.urlopen(url)
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)

    # most properties are easy to pull
    optional = ["home_phone"]
    properties = {
        'start_year': 'lblStartYear',
        'district': "linkDistrict",
        'occupation': "lblOccupation",
        'header': "lblHeader",
        'addr_street': "lblAddress",
        'office_phone': ["lblCapitolPhone", "lblOfficePhone"],
        'home_phone': "lblHomePhone",
    }

    for key, value in properties.iteritems():
        if isinstance(value, list):
            values = value
        else:
            values = [value]

        # Initialize val so a missing element can't leave it unbound.
        val = None
        found = False
        for value in values:
            id_ = 'ctl00_mainCopy_formViewLegislator_%s' % value
            try:
                val = "\n".join(doc.get_element_by_id(id_).itertext())
                found = True
            except KeyError:
                pass

        if val:
            properties[key] = val.strip()
        else:
            properties[key] = None

        if found is False and key not in optional:
            self.warning('bad legislator page %s missing %s' % (url, id_))
            return

    # image & email are a bit different
    properties['photo_url'] = doc.xpath(
        '//img[@id="ctl00_mainCopy_formViewLegislator_imgLegislator"]/@src'
    )[0]
    email = doc.get_element_by_id(
        'ctl00_mainCopy_formViewLegislator_linkEmail').text
    if email:
        properties['email'] = email.strip()

    properties['url'] = url
    properties['chamber'] = chamber
    properties['term'] = term

    full_name, party = properties['header'].rsplit("-", 1)
    properties['full_name'] = full_name
    properties['party'] = party

    if '(D)' in properties['party']:
        properties['party'] = 'Democratic'
    elif '(R)' in properties['party']:
        properties['party'] = 'Republican'
    elif '(DTS)' in properties['party']:
        # decline to state = independent
        properties['party'] = 'Independent'
    else:
        raise Exception("unknown party encountered")

    address = properties.pop('addr_street')
    phone = (properties.pop('office_phone') or
             properties.pop('home_phone'))

    leg = Legislator(**properties)
    leg.add_source(url)
    leg.add_office('district', 'District Address', address=address,
                   phone=phone)

    # committees (skip the header row)
    for row in doc.xpath(
            '//table[@id="ctl00_mainCopy_MembershipGrid"]/tr')[1:]:
        role, committee, note = [x.text_content()
                                 for x in row.xpath('td')]
        committee = committee.title()
        if 'Interim' in note:
            role = 'interim ' + role.lower()
        else:
            role = role.lower()
        leg.add_role('committee member', term, committee=committee,
                     position=role, chamber=chamber)

    # Already have the photo url.
    try:
        del leg['image_url']
    except KeyError:
        pass

    self.save_legislator(leg)
def scrape(self, term, chambers): year_slug = term[5:] # Load all members via the private API legislator_dump_url = ( 'http://legislature.vermont.gov/people/loadAll/{}'.format( year_slug)) json_data = self.get(legislator_dump_url).text legislators = json.loads(json_data)['data'] # Parse the information from each legislator for info in legislators: # Strip whitespace from strings info = {k: v.strip() for k, v in info.iteritems()} # Gather photo URL from the member's page member_url = ( 'http://legislature.vermont.gov/people/single/{}/{}'.format( year_slug, info['PersonID'])) page = self.lxmlize(member_url) (photo_url, ) = page.xpath('//img[@class="profile-photo"]/@src') # Also grab their state email address state_email = page.xpath( '//dl[@class="summary-table profile-summary"]/' 'dt[text()="Email"]/following-sibling::dd[1]/a/text()') if state_email: (state_email, ) = state_email else: state_email = None leg = Legislator( term=term, chamber=self.CHAMBERS[info['Title']], district=info['District'].replace(" District", ""), party=info['Party'].replace("Democrat", "Democratic"), full_name="{0} {1}".format(info['FirstName'], info['LastName']), photo_url=photo_url) leg.add_office( type='capitol', name='Capitol Office', address= 'Vermont State House\n115 State Street\nMontpelier, VT 05633', email=state_email) leg.add_office(type='district', name='District Office', address="{0}{1}\n{2}, {3} {4}".format( info['MailingAddress1'], ("\n" + info['MailingAddress2'] if info['MailingAddress2'].strip() else ""), info['MailingCity'], info['MailingState'], info['MailingZIP']), phone=(info['HomePhone'].strip() or None), email=(info['Email'].strip() or info['HomeEmail'].strip() or info['WorkEmail'].strip() or None)) leg.add_source(legislator_dump_url) leg.add_source(member_url) self.save_legislator(leg)
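# Shape of one record in the loadAll dump, reconstructed from the keys the
# loop reads above (every value here is illustrative, not real data):
# {"PersonID": "14321", "Title": "Senator", "District": "Addison District",
#  "Party": "Democrat", "FirstName": "Jane", "LastName": "Doe",
#  "MailingAddress1": "1 Main St", "MailingAddress2": "",
#  "MailingCity": "Montpelier", "MailingState": "VT", "MailingZIP": "05602",
#  "HomePhone": "", "Email": "", "HomeEmail": "", "WorkEmail": ""}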
def scrape(self, chamber, term):
    for tdata in self.metadata['terms']:
        if term == tdata['name']:
            year = tdata['start_year']
            session_number = tdata['session_number']
            break

    # Fetch the csv.
    url = 'http://leg.mt.gov/content/sessions/%s/%d%sMembers.txt' % \
        (session_number, year,
         'Senate' if chamber == 'upper' else 'House')

    # Parse it.
    data = self.urlopen(url)
    data = data.replace('"""', '"')  # weird triple quotes
    data = data.splitlines()
    fieldnames = ['last_name', 'first_name', 'party', 'district',
                  'address', 'city', 'state', 'zip']
    csv_parser = csv.DictReader(data, fieldnames)

    district_leg_urls = self._district_legislator_dict()

    # Toss the row headers.
    next(csv_parser)

    for entry in csv_parser:
        if not entry:
            continue

        # District, e.g. "HD 42" -> ("HD", "42").
        district = entry['district']
        hd_or_sd, district = district.split()
        del entry['district']

        # Party; drop it from the entry since it is passed to the
        # Legislator constructor explicitly.
        party_letter = entry['party']
        party = {'D': 'Democratic', 'R': 'Republican'}[party_letter]
        del entry['party']

        # Get full name properly capped.
        fullname = '%s %s' % (entry['first_name'].capitalize(),
                              entry['last_name'].capitalize())

        # Get any info at the legislator's detail_url.
        detail_url = district_leg_urls[hd_or_sd][district]

        # Get the office.
        address = '\n'.join([
            entry['address'],
            '%s, %s %s' % (entry['city'], entry['state'], entry['zip'])])
        office = dict(name='District Office', type='district',
                      phone=None, fax=None, email=None,
                      address=address)

        try:
            deets = self._scrape_details(detail_url)
        except NoDetails:
            self.logger.warning("No details found at %r" % detail_url)
            continue

        # Add the details and delete junk.
        entry.update(deets)
        del entry['first_name'], entry['last_name']

        legislator = Legislator(term, chamber, district, fullname,
                                party=party)
        legislator.update(entry)
        legislator.add_source(detail_url)
        legislator.add_source(url)
        legislator['url'] = detail_url

        office['phone'] = deets.get('phone')
        office['fax'] = deets.get('fax')
        legislator.add_office(**office)

        self.save_legislator(legislator)
def scrape_member(self, chamber, term, member_url):
    page = self.urlopen(member_url)
    root = lxml.html.fromstring(page)
    root.make_links_absolute(member_url)

    photo_url = root.xpath(
        '//div[starts-with(@class,"bioPicContainer")]/img/@src')[0]
    full_name = root.xpath(
        '//div[starts-with(@class,"bioPicContainer")]/img/@alt')[0]
    email = root.xpath('//a[contains(@href, "mailto")]/@href')[0]
    email = email.replace('mailto:', '')

    district = root.xpath(
        '//div[@id="District"]//div[starts-with(@class,"widgetContent")]')
    if len(district):
        district = district[0].text_content().strip()
        district = clean_district(district)
    else:
        self.logger.warning('No district tab found; skipping.')
        return

    party = root.xpath('//span[@class="legislatorAffiliation"]/text()')[0]
    if party == 'D':
        party = 'Democratic'
    elif party == 'R':
        party = 'Republican'
    else:
        party = 'Other'

    leg = Legislator(term, chamber, district, full_name, party=party,
                     photo_url=photo_url, url=member_url, email=email)
    leg.add_source(member_url)

    # offices
    for dl in root.xpath('//dl[@class="address"]'):
        office_name = phone = fax = None
        address = []
        for child in dl.getchildren():
            text = child.text_content()
            if child.tag == 'dt':
                office_name = text
            else:
                # str.strip() takes a character set, so use replace()
                # to drop the labels without eating digits.
                if text.startswith('Phone:'):
                    phone = text.replace('Phone:', '').strip() or None
                elif text.startswith('Fax:'):
                    fax = text.replace('Fax:', '').strip() or None
                elif text.startswith('Email:'):
                    pass
                else:
                    address.append(text)

        # all pieces collected
        if 'District' in office_name:
            otype = 'district'
        else:
            otype = 'capitol'

        address = filter(None, [re.sub(r'\s+', ' ', s).strip()
                                for s in address])
        if address:
            leg.add_office(otype, office_name, phone=phone, fax=fax,
                           address='\n'.join(address), email=None)

    self.save_legislator(leg)
def scrape(self, chamber, term): if chamber == 'upper': url = ('http://webserver.rilin.state.ri.us/Documents/Senators.xls') rep_type = 'Senator ' source_url = 'http://www.rilin.state.ri.us/senators/default.aspx' source_url_title_replacement = rep_type elif chamber == 'lower': url = ( 'http://webserver.rilin.state.ri.us/Documents/Representatives.xls' ) rep_type = 'Representative ' source_url = 'http://www.rilin.state.ri.us/representatives/default.aspx' source_url_title_replacement = 'Rep. ' self.urlretrieve(url, 'ri_leg.xls') wb = xlrd.open_workbook('ri_leg.xls') sh = wb.sheet_by_index(0) # This isn't perfect but it's cheap and better than using the # XLS doc as the source URL for all legislators. # 374: RI: legislator url leg_source_url_map = {} leg_page = lxml.html.fromstring(self.urlopen(source_url)) leg_page.make_links_absolute(source_url) for link in leg_page.xpath('//td[@class="ms-vb2"]'): leg_name = link.text_content().replace( source_url_title_replacement, '') leg_url = link.xpath("..//a")[0].attrib['href'] leg_source_url_map[leg_name] = leg_url for rownum in xrange(1, sh.nrows): d = {} for field, col_num in excel_mapping.iteritems(): d[field] = sh.cell(rownum, col_num).value dist = str(int(d['district'])) district_name = dist full_name = re.sub(rep_type, '', d['full_name']).strip() translate = { "Democrat": "Democratic", "Republican": "Republican", "Independent": "Independent" } homepage_url = None if full_name in leg_source_url_map.keys(): homepage_url = leg_source_url_map[full_name] kwargs = { "town_represented": d['town_represented'], "email": d['email'] } if homepage_url is not None: kwargs['url'] = homepage_url leg = Legislator(term, chamber, district_name, full_name, '', '', '', translate[d['party']], **kwargs) leg.add_office('district', 'Address', address=d['address']) leg.add_source(source_url) if homepage_url: leg.add_source(homepage_url) self.save_legislator(leg)
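# excel_mapping is defined outside this section; from the fields read
# above it maps field names to spreadsheet column indexes. The column
# positions below are assumptions, but the keys are exactly the ones the
# loop uses:
excel_mapping = {
    'district': 0,
    'full_name': 1,
    'party': 2,
    'address': 3,
    'town_represented': 4,
    'email': 5,
}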
def scrape_legislator(self, chamber, term, url): # Initialize default values for legislator attributes. full_name = None party = None photo_url = None email = None capitol_address = None capitol_phone = None district = None district_address = None district_phone = None if chamber == 'upper': title_prefix = 'Senator ' elif chamber == 'lower': title_prefix = 'Representative ' else: title_prefix = '' santa_fe_area_code = '(505)' page = self.lxmlize(url) info_node = self.get_node( page, '//table[@id="MainContent_formViewLegislator"]') if info_node is None: raise ValueError('Could not locate legislator data.') district_node = self.get_node( info_node, './/a[@id="MainContent_formViewLegislator_linkDistrict"]') if district_node is not None: district = district_node.text.strip() name_node = self.get_node( page, './/span[@id="MainContent_formViewLegislatorName' '_lblLegislatorName"]') if name_node is not None: if name_node.text.strip().endswith(' Vacant'): self.warning("Found vacant seat for {} district {}; skipping".format(chamber, district)) return n_head, n_sep, n_party = name_node.text.rpartition(' - ') full_name = re.sub(r'^{}'.format(title_prefix), '', n_head.strip()) if '(D)' in n_party: party = 'Democratic' elif '(R)' in n_party: party = 'Republican' elif '(DTS)' in n_party: # decline to state = independent party = 'Independent' else: raise AssertionError('Unknown party {} for {}'.format( party, full_name)) photo_node = self.get_node( info_node, './/img[@id="MainContent_formViewLegislator_imgLegislator"]') if photo_node is not None: photo_url = photo_node.get('src') email_node = self.get_node( info_node, './/a[@id="MainContent_formViewLegislator_linkEmail"]') if email_node is not None and email_node.text: email = email_node.text.strip() capitol_address_node = self.get_node( info_node, './/span[@id="MainContent_formViewLegislator_lblCapitolRoom"]') if capitol_address_node is not None: capitol_address_text = capitol_address_node.text if capitol_address_text is not None: capitol_address = 'Room {} State Capitol\nSanta Fe, NM 87501'\ .format(capitol_address_text.strip()) capitol_phone_node = self.get_node( info_node, './/span[@id="MainContent_formViewLegislator_lblCapitolPhone"]') if capitol_phone_node is not None: capitol_phone_text = capitol_phone_node.text if capitol_phone_text: capitol_phone_text = capitol_phone_text.strip() area_code, phone = extract_phone_number(capitol_phone_text) if phone: capitol_phone = '{} {}'.format( area_code.strip() if area_code else santa_fe_area_code, phone) district_address_node = self.get_node( info_node, './/span[@id="MainContent_formViewLegislator_lblAddress"]') if district_address_node is not None: district_address = '\n'.join(district_address_node.xpath("text()")) office_phone_node = self.get_node( info_node, './/span[@id="MainContent_formViewLegislator_lblOfficePhone"]') home_phone_node = self.get_node( info_node, './/span[@id="MainContent_formViewLegislator_lblHomePhone"]') if office_phone_node is not None and office_phone_node.text: district_phone_text = office_phone_node.text elif home_phone_node is not None and home_phone_node.text: district_phone_text = home_phone_node.text else: district_phone_text = None if district_phone_text: d_area_code, d_phone = extract_phone_number(district_phone_text) district_phone = '{} {}'.format(d_area_code.strip(), d_phone) legislator = Legislator( term=term, chamber=chamber, district=district, full_name=full_name, party=party, photo_url=photo_url) if email: legislator['email'] = email legislator.add_source(url) 
legislator.add_office( 'district', 'District Office', address=district_address, phone=district_phone) legislator.add_office( 'capitol', 'Capitol Office', address=capitol_address, phone=capitol_phone, email=email) self.save_legislator(legislator)
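# extract_phone_number() is used above but not shown. It evidently returns
# an (area_code, number) pair, with area_code possibly None (the capitol
# branch falls back to the Santa Fe code). A minimal sketch under that
# assumption:
def extract_phone_number(text):
    match = re.match(r'\s*(\(\d{3}\))?\s*([\d][\d\- .]{6,})', text)
    if not match:
        return None, None
    return match.group(1), match.group(2).strip()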
def scrape(self, chamber, term):
    # The url for each rep is unfindable (by me), and the parts needed to
    # make it up do not appear in the html or js. We can find basic
    # information on the main rep page, and sponsor info on a version of
    # their individual page called using only their sponsor ID (which we
    # have to scrape from ALISON). We can't get detailed information
    # without another ID, which I have not been able to find.
    if chamber == 'upper':
        member_list_url = self._base_url + 'Senate/ALSenators.aspx'
        legislator_base_url = self._base_url + 'ALSenator.aspx'
    elif chamber == 'lower':
        member_list_url = self._base_url + 'House/ALRepresentatives.aspx'
        legislator_base_url = self._base_url + 'ALRepresentative.aspx'

    page = self.lxmlize(member_list_url)

    legislator_nodes = self.get_nodes(
        page,
        '//div[@class="container container-main"]/table/tr/td/input')

    legislator_url_template = (legislator_base_url +
                               '?OID_SPONSOR={oid_sponsor}'
                               '&OID_PERSON={oid_person}')

    html_parser = HTMLParser.HTMLParser()

    for legislator_node in legislator_nodes:
        # Set identifiers internal to AlisonDB.
        # Have to do this to OID_SPONSOR because they don't know how to
        # HTML, and I'm making links absolute out of convenience.
        try:
            oid_sponsor = legislator_node.attrib['longdesc'].split('/')[-1]
            oid_person = legislator_node.attrib['alt']
        except KeyError:
            continue

        legislator_url = legislator_url_template.format(
            oid_sponsor=oid_sponsor, oid_person=oid_person)
        legislator_page = self.lxmlize(legislator_url)

        name_text = self.get_node(
            legislator_page,
            '//span[@id="ContentPlaceHolder1_lblMember"]').text_content()\
            .encode('utf-8')

        # This just makes processing the text easier.
        name_text = name_text.lower()

        # Skip vacant seats.
        if 'vacant' in name_text:
            continue

        # Remove titles and nicknames.
        name = html_parser.unescape(
            re.sub(r'(?i)(representative|senator|".*")', '',
                   name_text).strip().title())

        # Assemble full name by reversing the "last, first" format.
        name_parts = [x.strip() for x in name.split(',')]
        full_name = '{0} {1}'.format(name_parts[1], name_parts[0])

        info_node = self.get_node(
            legislator_page,
            '//div[@id="ContentPlaceHolder1_TabSenator_body"]//table')

        district_text = self.get_node(
            info_node, './tr[2]/td[2]').text_content().encode('utf-8')
        if chamber == 'upper':
            district = district_text.replace('Senate District', '').strip()
        elif chamber == 'lower':
            district = district_text.replace('House District', '').strip()

        party_text = self.get_node(
            info_node, './tr[1]/td[2]').text_content().encode('utf-8')
        if not full_name.strip() and party_text == '()':
            self.warning('Found empty seat for district {}; '
                         'skipping'.format(district))
            continue
        party = self._parties[party_text.strip()]

        phone_number_text = self.get_node(
            info_node, './tr[4]/td[2]').text_content().encode('utf-8')
        phone_number = phone_number_text.strip()

        fax_number_text = self.get_node(
            info_node, './tr[5]/td[2]').text_content().encode('utf-8')
        fax_number = fax_number_text.strip()

        suite_text = self.get_node(
            info_node, './tr[7]/td[2]').text_content().encode('utf-8')
        office_address = ('{}\n11 S. Union Street\n'
                          'Montgomery, AL 36130'.format(suite_text))

        email_text = self.get_node(
            info_node, './tr[11]/td[2]').text_content().encode('utf-8')
        email_address = email_text.strip()

        photo_url = self.get_node(
            legislator_page,
            '//input[@id="ContentPlaceHolder1_TabSenator_TabLeg_imgLEG"]'
            '/@src')

        # Add basic legislator info and the main office.
        legislator = Legislator(term=term, district=district,
                                chamber=chamber, full_name=full_name,
                                party=party, email=email_address,
                                photo_url=photo_url)
        legislator.add_office('capitol', 'Capitol Office',
                              address=office_address, phone=phone_number,
                              fax=fax_number)

        self.add_committees(legislator_page, legislator, chamber, term)
        legislator.add_source(member_list_url)
        legislator.add_source(legislator_url)
        self.save_legislator(legislator)