def scrape(self, chamber, term):
    """Scrape the current member listing for ``chamber``.

    Every link to a ``_bio.cfm`` page on the listing is turned into a
    ``Legislator`` and saved.  Only the latest term is accepted.
    """
    # Pennsylvania doesn't make member lists easily available
    # for previous sessions, unfortunately
    self.validate_term(term, latest_only=True)

    party_names = {'R': 'Republican', 'D': 'Democratic'}
    listing_url = legislators_url(chamber)

    with self.urlopen(listing_url) as raw:
        doc = lxml.html.fromstring(raw)
        doc.make_links_absolute(listing_url)

        for anchor in doc.xpath("//a[contains(@href, '_bio.cfm')]"):
            label = anchor.text
            # The district text trails the element after the link's parent.
            trailing = anchor.getparent().getnext().tail.strip()
            district = re.search("District (\d+)", trailing).group(1)

            # The party letter is the second-to-last character of the
            # link text; unknown letters are passed through unchanged.
            code = label[-2]
            party = party_names.get(code, code)

            member = Legislator(term, chamber, district, label,
                                party=party, url=anchor.get('href'))
            member.add_source(listing_url)
            self.save_legislator(member)
def scrape_legislator(self, chamber, term, name, url):
    """Scrape one member page and save the resulting legislator.

    :param chamber: chamber identifier passed through to ``Legislator``
    :param term: term identifier passed through to ``Legislator``
    :param name: full name to record for the member
    :param url: address of the member's page

    Raises ``IndexError`` / ``AttributeError`` if the expected district
    heading, party marker or member photo is missing from the page.
    """
    party_names = {'D': 'Democratic', 'R': 'Republican',
                   'I': 'Independent'}

    html = self.get(url).text
    page = lxml.html.fromstring(html)
    page.make_links_absolute(url)

    # Second word of the "DISTRICT" heading, with leading zeros removed.
    heading = page.xpath('//h1[contains(., "DISTRICT")]/text()').pop()
    district = heading.split()[1].strip().lstrip('0')

    # The last <h2> carries a "(R-..." style party marker; the pattern
    # only admits R, D or I, so the table lookup below cannot miss.
    party_text = page.xpath('//h2').pop().text_content()
    party_code = re.search(r'\((R|D|I)[ \-\]]', party_text).group(1)
    party = party_names[party_code]

    photo_url = page.xpath(
        "//img[contains(@src, 'images/members/')]")[0].attrib['src']

    leg = Legislator(term, chamber, district, name, party=party,
                     photo_url=photo_url, url=url)
    leg.add_source(url)
    self.scrape_offices(leg, page)
    self.save_legislator(leg)
def scrape_upper(self, chamber, term):
    """Scrape the Michigan Senate member list and save each senator.

    Rows whose member cell reads ``Vacant`` are logged and skipped.
    """
    list_url = 'http://www.senate.michigan.gov/members/memberlist.htm'
    tree = lxml.html.fromstring(self.urlopen(list_url))

    # Skip the first row and read rows 1..38 of the 550px-wide table.
    for tr in tree.xpath('//table[@width=550]/tr')[1:39]:
        # Columns: party, district, member, office phone, office fax,
        # office location.
        party_td, dist_td, member_td, phone_td, fax_td, loc_td = \
            tr.getchildren()

        party = abbr[party_td.text]
        district = dist_td.text_content().strip()
        name = member_td.text_content().strip()
        if name == 'Vacant':
            self.info('district %s is vacant', district)
            continue

        member_url = member_td.xpath('a/@href')[0]
        senator = Legislator(term=term, chamber=chamber, district=district,
                             full_name=name, party=party, url=member_url)
        senator.add_office('capitol', 'Capitol Office',
                           address=loc_td.text,
                           fax=fax_td.text,
                           phone=phone_td.text)
        senator.add_source(list_url)
        self.save_legislator(senator)
def scrape(self, chamber, term):
    """Scrape the member list for ``chamber`` and save each legislator.

    The listing address is built from ``MEMBER_LIST_URL`` and the term
    with its last two characters removed.  Each member page is fetched
    as well to pick up a photo.
    """
    party_names = {'D':'Democratic', 'R': 'Republican',
                   'I': 'Independent'}

    list_url = MEMBER_LIST_URL[chamber] % term[:-2]
    listing = lxml.html.fromstring(self.urlopen(list_url))
    listing.make_links_absolute(list_url)

    # Fifth table on the page; its first two rows are not members.
    for tr in listing.xpath('//table')[4].xpath('tr')[2:]:
        name_td, _, _, district_td, party_td = tr.xpath('td')
        district = district_td.text
        party = party_names[party_td.text]
        member_url = name_td.xpath('a/@href')[0]
        name = name_td.text_content().strip()

        # inactive legislator, skip them for now
        if name.endswith('*'):
            continue

        member_doc = lxml.html.fromstring(self.urlopen(member_url))
        photo_url = member_doc.xpath(
            '//img[contains(@src, "/members/")]/@src')[0]

        member = Legislator(term, chamber, district, name, party=party,
                            url=member_url, photo_url=photo_url)
        member.add_source(list_url)
        member.add_source(member_url)
        self.save_legislator(member)
def scrape(self, chamber, term):
    """Download the Rhode Island member spreadsheet and save its rows.

    The ``.xls`` file for ``chamber`` ('upper' or 'lower') is written to
    ``ri_senate.xls`` in the working directory, then every row after the
    header is turned into a ``Legislator``.  Only the latest term is
    accepted.
    """
    self.validate_term(term, latest_only=True)

    if chamber == 'upper':
        url = ('http://www.rilin.state.ri.us/Documents/Senators.xls')
        rep_type = 'Senator '
    elif chamber == 'lower':
        url = ('http://www.rilin.state.ri.us/Documents/Representatives.xls')
        rep_type = 'Representative '

    with self.urlopen(url) as senator_xls:
        # An .xls workbook is binary data: text mode would translate
        # line endings on some platforms and corrupt the file.
        with open('ri_senate.xls', 'wb') as f:
            f.write(senator_xls)

    wb = xlrd.open_workbook('ri_senate.xls')
    sh = wb.sheet_by_index(0)

    # Row 0 is the header row.
    for rownum in xrange(1, sh.nrows):
        d = {}
        for field, col_num in excel_mapping.iteritems():
            d[field] = str(sh.cell(rownum, col_num).value)

        district_name = "District " + d['district']
        # Remove the "Senator " / "Representative " title from the name.
        full_name = re.sub(rep_type, '', d['full_name']).strip()

        leg = Legislator(term, chamber, district_name, full_name,
                         '', '', '',
                         d['party'],
                         office_address=d['address'],
                         town_represented=d['town_represented'],
                         email=d['email'])
        leg.add_source(url)
        self.save_legislator(leg)
def scrape_details(self, chamber, term, leg_name, leg_link, role):
    """Scrape one Mississippi member detail page and save the legislator.

    :param chamber: 'upper' or 'lower'
    :param term: term identifier passed through to ``Legislator``
    :param leg_name: full name to record
    :param leg_link: path of the detail page, relative to the members
        directory on billstatus.ls.state.ms.us
    :param role: role passed through to ``Legislator``

    A ``scrapelib.HTTPError`` is logged as a warning instead of being
    propagated.
    """
    # E-mail addresses use the chamber's public name as host, not the
    # internal 'upper'/'lower' identifier.  Unknown identifiers are
    # passed through unchanged.
    email_hosts = {'upper': 'senate', 'lower': 'house'}

    try:
        url = 'http://billstatus.ls.state.ms.us/members/%s' % leg_link
        with self.urlopen(url) as details_page:
            details_page = details_page.decode('latin1').encode(
                'utf8', 'ignore')
            root = lxml.etree.fromstring(details_page,
                                         lxml.etree.HTMLParser())

            party = root.xpath('string(//party)')
            district = root.xpath('string(//district)')
            first_name, middle_name, last_name = "", "", ""
            home_phone = root.xpath('string(//h_phone)')
            bis_phone = root.xpath('string(//b_phone)')
            capital_phone = root.xpath('string(//cap_phone)')
            other_phone = root.xpath('string(//oth_phone)')
            org_info = root.xpath('string(//org_info)')

            # NOTE(review): an address is still built when the page has
            # no email_address value -- confirm whether it should be
            # omitted in that case.
            email_name = root.xpath('string(//email_address)')
            email = '%s@%s.ms.gov' % (
                email_name, email_hosts.get(chamber, chamber))

            # Anything other than 'D' is recorded as Republican.
            if party == 'D':
                party = 'Democratic'
            else:
                party = 'Republican'

            leg = Legislator(term, chamber, district, leg_name,
                             first_name, last_name, middle_name, party,
                             role=role, home_phone=home_phone,
                             bis_phone=bis_phone,
                             capital_phone=capital_phone,
                             other_phone=other_phone, org_info=org_info,
                             email=email, url=url)
            leg.add_source(url)
            self.save_legislator(leg)
    except scrapelib.HTTPError as e:
        self.warning(str(e))
def scrape_details(self, chamber, term, leg_name, leg_link, role):
    """Scrape one Mississippi member XML page and save the legislator.

    :param chamber: 'upper' or 'lower'
    :param term: term identifier passed through to ``Legislator``
    :param leg_name: full name to record
    :param leg_link: path of the detail page, relative to the members
        directory on billstatus.ls.state.ms.us
    :param role: role passed through to ``Legislator``

    Returns without saving when ``leg_link`` is empty and the name
    contains "Vacancy"; raises ``Exception`` when ``leg_link`` is empty
    for any other name.  A ``scrapelib.HTTPError`` is logged as a
    warning instead of being propagated.
    """
    if not leg_link:
        # Vacant post, likely:
        if "Vacancy" in leg_name:
            return
        raise Exception("leg_link is null. something went wrong")

    try:
        url = 'http://billstatus.ls.state.ms.us/members/%s' % leg_link
        url_root = os.path.dirname(url)
        details_page = self.urlopen(url)
        root = lxml.etree.fromstring(details_page.bytes)

        party = root.xpath('string(//PARTY)')
        district = root.xpath('string(//DISTRICT)')
        # The image name is relative to the directory of the member page.
        photo = "%s/%s" % (url_root, root.xpath('string(//IMG_NAME)'))
        capital_phone = root.xpath('string(//CAP_PHONE)')
        org_info = root.xpath('string(//ORG_INFO)')
        email_name = root.xpath('string(//EMAIL_ADDRESS)')
        cap_room = root.xpath('string(//CAP_ROOM)')

        # Anything other than 'D' is recorded as Republican.
        if party == 'D':
            party = 'Democratic'
        else:
            party = 'Republican'

        leg = Legislator(term, chamber, district, leg_name, party=party,
                         role=role, org_info=org_info, url=url,
                         photo_url=photo)
        leg.add_source(url)

        # Only pass the office fields the page actually provided.
        kwargs = {}

        if email_name.strip() != "":
            email = '%s@%s.ms.gov' % (email_name, {
                "upper": "senate",
                "lower": "house"
            }[chamber])
            kwargs['email'] = email

        if capital_phone != "":
            kwargs['phone'] = capital_phone

        if cap_room != "":
            kwargs["address"] = "Room %s\n%s" % (cap_room, CAP_ADDRESS)
        else:
            kwargs['address'] = CAP_ADDRESS

        leg.add_office('capitol', 'Capitol Office', **kwargs)
        self.save_legislator(leg)
    except scrapelib.HTTPError as e:
        self.warning(str(e))
def scrape_reps(self, chamber, term):
    """Scrape the Ohio House member page of every district and save it.

    One page is fetched per district number; pages whose title reads as
    a vacant position are skipped.
    """
    party_names = {"D": "Democratic", "R": "Republican"}

    # There are 99 House districts
    for district in xrange(1, 100):
        rep_url = ("http://www.house.state.oh.us/components/"
                   "com_displaymembers/page.php?district=%d" % district)

        with self.urlopen(rep_url) as raw:
            doc = lxml.html.fromstring(raw)

            for table in doc.xpath('//table[@class="page"]'):
                title = table.xpath("tr/td/title")[0].text
                # The title ends with a party letter in the
                # second-to-last position; the last three characters
                # are dropped from the name.
                code = title[-2]
                full_name = title[0:-3]

                # "Vacant Position" with its last three characters cut.
                if full_name == "Vacant Posit":
                    continue

                # Unknown party letters are passed through unchanged.
                party = party_names.get(code, code)

                rep = Legislator(term, chamber, str(district),
                                 full_name, party=party, url=rep_url)
                rep.add_source(rep_url)
                self.save_legislator(rep)
def scrape_upper(self, term):
    """Scrape the New York Senate member listing and save each senator.

    Party is recorded as "Unknown".  Office details are filled in by
    ``scrape_upper_offices`` from each senator's contact page.
    """
    listing_url = "http://www.nysenate.gov/senators"
    doc = lxml.html.fromstring(self.urlopen(listing_url))
    doc.make_links_absolute(listing_url)

    name_links = (
        '//div[contains(@class, "views-row")]/'
        'div[contains(@class, "last-name")]/'
        'span[contains(@class, "field-content")]/a')

    for anchor in doc.xpath(name_links):
        # Links without text and the Contact / RSS links are not names.
        if anchor.text in (None, 'Contact', 'RSS'):
            continue

        name = anchor.text.strip()

        district_text = anchor.xpath("string(../../../div[3]/span[1])")
        district = re.match(r"District (\d+)", district_text).group(1)

        photo_url = anchor.xpath(
            "../../../div[1]/span/a/img")[0].attrib['src']

        senator = Legislator(term, 'upper', district, name,
                             party="Unknown", photo_url=photo_url)
        senator.add_source(listing_url)

        contact_url = anchor.xpath(
            "../span[@class = 'contact']/a")[0].attrib['href']
        self.scrape_upper_offices(senator, contact_url)

        # The member page is the contact address without '/contact'.
        senator['url'] = contact_url.replace('/contact', '')

        self.save_legislator(senator)
def scrape_2011Leg(self, chamber, term, url):
    """Scrape the 2011 member grid at ``url`` and save each legislator.

    Only rows of the ``GridView1`` table that link to a member page are
    read.
    """
    parties = {'(D)': 'Democratic', '(R)': 'Republican'}

    with self.urlopen(url) as raw:
        doc = lxml.html.fromstring(raw)
        doc.make_links_absolute(url)
        grid = doc.xpath('//table[contains(@id, "GridView1")]')[0]

        for tr in grid.xpath('tr[td/a[contains(@href, "memberpage")]]'):
            district = tr.xpath(
                'td/span[contains(@id, "LabelDistrict")]/font')[0].text

            surname_link = tr.xpath(
                'td/a[contains(@id, "HyperLinkLast")]')[0]
            member_url = surname_link.get('href')
            last_name = surname_link.text_content().strip()

            first_names = tr.xpath(
                'td/span[contains(@id, "LabelFirst")]/font')[0].text.strip()
            # First word is the first name; the rest is the middle name.
            given = first_names.split()
            first_name = given[0]
            middle_name = ' '.join(given[1:])

            party_label = tr.xpath(
                'td/span[contains(@id, "LabelParty")]/font')[0].text
            party = parties[party_label]

            room = tr.xpath('td/span[contains(@id, "LabelRoom")]')[0].text
            room2 = tr.xpath('td/span[contains(@id, "LabelRoom2")]')[0].text
            extras = {'office_address': room + " " + room2}
            extras['photo_url'] = tr.xpath(
                'td/a[contains(@id, "HyperLinkChairJPG")]/img'
            )[0].attrib['src']
            extras['email'] = tr.xpath(
                'td/a[contains(@id, "HyperLinkEmail")]')[0].text
            extras['phone'] = tr.xpath(
                'td/span[contains(@id, "LabelPhone2")]')[0].text

            full_name = first_names + " " + last_name

            member = Legislator(term, chamber, district, full_name,
                                first_name, last_name, middle_name, party,
                                url=member_url, **extras)
            member.add_source(url)
            self.save_legislator(member)
def _scrape_speaker_of_the_house(self, url, term, chamber):
    """Build a ``Legislator`` for the Speaker from his dedicated page.

    The Speaker does not use the regular member page layout.  The link
    on the main page looks like
        http://www1.legis.ga.gov/legis/2011_12/house/speaker/index.htm
    but the data lives at
        http://www1.legis.ga.gov/legis/2011_12/house/speaker/bio.html

    Returns the legislator (not saved here), or ``None`` when the page
    does not contain exactly one ``<div id="title">``.
    """
    if url.endswith("index.htm"):
        url = url.replace("index.htm", "bio.html")

    party_markers = (("R-", "Republican"),
                     ("D-", "Democrat"),
                     ("I-", "Independent"))

    with self.lxml_context(url) as page:
        title_divs = page.xpath('//div[@id="title"]')
        if not title_divs or len(title_divs) != 1:
            return None

        # This isn't exactly great but it's the best/quickest solution
        # for now: the fields are picked by word position in the title.
        words = title_divs[0].text_content().split()
        name = words[2] + " " + words[3]

        party = None
        for marker, label in party_markers:
            if marker in words[4]:
                party = label
                break

        district = None
        if "district" in words[6].lower():
            district = words[7].strip(")")

        speaker = Legislator(term, chamber, district, name, party=party)
        speaker.add_source(url)
        return speaker
def scrape_rep(self, name, term, url):
    """Scrape one representative's page and save the legislator.

    :param name: name in "Last, First" form as listed on the index
    :param term: term identifier passed through to ``Legislator``
    :param url: address of the member's page

    Raises ``IndexError`` / ``AttributeError`` if the page has no link
    to a district map.
    """
    # special case names that confuses name_tools
    if name == "Franklin, A.B.":
        name = "Franklin, A. B."
    elif ", Jr., " in name:
        # str.replace returns a new string; the result must be kept or
        # the suffix ends up in the name twice.
        name = name.replace(", Jr., ", " ")
        name += ", Jr."
    elif ", III, " in name:
        name = name.replace(", III, ", " ")
        name += ", III"

    with self.urlopen(url) as text:
        page = lxml.html.fromstring(text)

        # The district number is taken from the map file name.
        district = page.xpath(
            "//a[contains(@href, 'Maps')]")[0].attrib["href"]
        district = re.search(r"district(\d+).pdf", district).group(1)

        # Party is detected from the raw page text.
        if "Democrat District" in text:
            party = "Democratic"
        elif "Republican District" in text:
            party = "Republican"
        elif "Independent District" in text:
            party = "Independent"
        else:
            party = "Other"

        leg = Legislator(term, "lower", district, name, party=party)
        leg.add_source(url)
        self.save_legislator(leg)
def scrape_senator(self, name, term, url):
    """Scrape one senator's page and save the legislator.

    The party is recorded as 'N/A' when the page has no bold "Party"
    label.
    """
    renamed = {'No Party (Independent)': 'Independent',
               'Democrat': 'Democratic'}

    with self.urlopen(url) as raw:
        doc = lxml.html.fromstring(raw)

        heading = doc.xpath(
            "string(//*[starts-with(text(), 'Senator ')])")
        district = re.search(r'District (\d+)', heading).group(1)

        try:
            # The value follows the element after the bold label.
            label = doc.xpath("//b[contains(text(), 'Party')]")[0]
            party = label.getnext().tail
            party = party.strip()
        except IndexError:
            party = 'N/A'

        # Other party names are kept as found on the page.
        party = renamed.get(party, party)

        senator = Legislator(term, 'upper', district, name,
                             party=party, url=url)
        senator.add_source(url)
        self.save_legislator(senator)
def scrape_member(self, chamber, term, member_url):
    """Scrape one member page and save the legislator.

    :param chamber: chamber identifier passed through to ``Legislator``
    :param term: term identifier passed through to ``Legislator``
    :param member_url: address of the member's page

    Raises ``IndexError`` if the photo, mailto link or party label is
    missing from the page.
    """
    party_names = {'D': 'Democratic', 'R': 'Republican'}

    with self.urlopen(member_url) as page:
        root = lxml.html.fromstring(page)
        root.make_links_absolute(member_url)

        # The portrait's src and alt give the photo address and name.
        photo_url = root.xpath(
            '//div[starts-with(@class,"bioPicContainer")]/img/@src')[0]
        full_name = root.xpath(
            '//div[starts-with(@class,"bioPicContainer")]/img/@alt')[0]

        email = root.xpath('//a[contains(@href, "mailto")]/@href')[0]
        email = email.replace('mailto:', '')

        district = root.xpath(
            '//div[@id="District"]'
            '//div[starts-with(@class,"widgetContent")]')
        # NOTE(review): when the district widget is missing, the empty
        # node list itself is passed on as the district -- confirm
        # whether such pages should be skipped instead.
        if district:
            district = clean_district(district[0].text.strip())

        party = root.xpath(
            '//span[@class="legislatorAffiliation"]/text()')[0]
        party = party_names.get(party, 'Other')

        leg = Legislator(term, chamber, district, full_name, party=party,
                         photo_url=photo_url, url=member_url, email=email)
        leg.add_source(member_url)
        self.save_legislator(leg)
def scrape(self, chamber, term):
    """Scrape the South Carolina member list and save each legislator.

    :param chamber: 'lower' selects the House list; anything else
        selects the Senate list
    :param term: term identifier passed through to ``Legislator``

    Rows without a ``[D]`` or ``[R]`` marker are skipped with a warning.
    Office address and phone are only recorded when the row contains
    them.
    """
    if chamber == 'lower':
        url = 'http://www.scstatehouse.gov/html-pages/housemembers.html'
    else:
        url = 'http://www.scstatehouse.gov/html-pages/senatemembersd.html'

    markers = (('[D]', 'Democratic'), ('[R]', 'Republican'))

    with self.urlopen(url) as data:
        doc = lxml.html.fromstring(data)
        rows = doc.xpath('//pre/div[@class="sansSerifNormal"]')

        for row in rows:
            member_a = row.xpath('a')[0]
            name_party = member_a.text_content()

            # Reset per row so values never leak in from the previous
            # member.
            party = full_name = None
            for marker, label in markers:
                if marker in name_party:
                    party = label
                    full_name = name_party.partition(marker)[0].strip()
                    break
            if party is None:
                self.warning('no party marker in %r, skipping' %
                             name_party)
                continue

            # The photo file is named after the number in the page link.
            photo_url = ('http://www.scstatehouse.gov/members/gif/' +
                         re.search(r'(\d+)\.html',
                                   member_a.attrib['href']).group(1) +
                         '.jpg')

            other_data = row.text_content().encode('ascii', 'ignore')
            od_result = re.search(r'^.+District (\d+) - (.+)Count.+$',
                                  other_data)
            district = od_result.group(1)

            # Only pass office details that were found on this row.
            office = {}
            contentb = re.search(
                r'^.+\(C\) (.+,.*\d+).*Bus. (\(\d+\) \d+-\d+).+$',
                other_data)
            if contentb is not None:
                office['office_address'] = contentb.group(1)
                office['office_phone'] = contentb.group(2)

            legislator = Legislator(term, chamber, district, full_name,
                                    party=party, photo_url=photo_url,
                                    **office)
            legislator.add_source(url)
            self.save_legislator(legislator)
def scrape_upper(self, term):
    """Scrape the Oklahoma Senate member list and save each senator.

    The recorded source of each senator is the biography page, which
    is also handed to ``scrape_upper_offices``.
    """
    party_names = {'(D)': 'Democratic', '(R)': 'Republican'}

    listing_url = "http://oksenate.gov/Senators/Default.aspx"
    doc = lxml.html.fromstring(self.urlopen(listing_url))
    doc.make_links_absolute(listing_url)

    # Biography links inside the second table carrying a summary.
    member_table = doc.xpath('//table[@summary]')[1]
    for anchor in member_table.xpath(
            './/td//a[contains(@href, "biographies")]'):
        # The link text ends with the party marker.
        name, party = anchor.text.rsplit(None, 1)
        party = party_names.get(party, party)

        # The district is the second word after the link's parent, or
        # of the second <span> two levels up when there is no tail.
        tail = anchor.xpath('..')[0].tail
        if tail:
            district = tail.split()[1]
        else:
            district = anchor.xpath('../../span')[1].text.split()[1]

        bio_url = anchor.get('href')

        senator = Legislator(term, 'upper', district, name,
                             party=party, url=bio_url)
        senator.add_source(bio_url)
        self.scrape_upper_offices(senator, bio_url)
        self.save_legislator(senator)
def scrape_lower(self, term):
    """Scrape the New York Assembly e-mail listing and save each member.

    Member links and mailto links are paired in page order.  Party is
    recorded as "Unknown".
    """
    listing_url = "http://assembly.state.ny.us/mem/?sh=email"

    with self.urlopen(listing_url) as raw:
        doc = lxml.html.fromstring(raw)
        doc.make_links_absolute(listing_url)

        member_links = doc.xpath("//a[contains(@href, '/mem/')]")
        mail_links = doc.xpath("//a[contains(@href, 'mailto')]")

        for anchor, mail_anchor in zip(member_links, mail_links):
            name = anchor.text.strip()

            # The page heading link and empty seats are not members.
            if name == "Assembly Members" or "Assembly District" in name:
                continue

            member_url = anchor.get("href")

            district = anchor.xpath("string(../following-sibling::"
                                    "div[@class = 'email2'][1])")
            # Drop the trailing ordinal letters ("st", "nd", "rd", "th").
            district = district.rstrip("rthnds")

            member = Legislator(term, "lower", district, name,
                                party="Unknown", url=member_url)
            member.add_source(listing_url)

            address = mail_anchor.text_content().strip()
            if address:
                member["email"] = address

            self.save_legislator(member)
def scrape_senate(self, term):
    """Scrape the Puerto Rico Senate listings and save each senator.

    The first listing holds the at-large senators; the remaining eight
    are the numbered districts, in order.  A photo address is guessed
    from the senator's name and only recorded when it can be fetched.
    """
    urls = (
        'http://www.senadopr.us/senadores/Pages/Senadores%20Acumulacion.aspx',
        'http://www.senadopr.us/Pages/Senadores%20Distrito%20I.aspx',
        'http://www.senadopr.us/Pages/Senadores%20Distrito%20II.aspx',
        'http://www.senadopr.us/Pages/Senadores%20Distrito%20III.aspx',
        'http://www.senadopr.us/Pages/Senadores%20Distrito%20IV.aspx',
        'http://www.senadopr.us/Pages/Senadores%20Distrito%20V.aspx',
        'http://www.senadopr.us/Pages/Senadores%20Distrito%20VI.aspx',
        'http://www.senadopr.us/Pages/Senadores%20Distrito%20VII.aspx',
        'http://www.senadopr.us/Pages/Senadores%20Distrito%20VIII.aspx')

    for counter, url in enumerate(urls):
        leg_page_html = self.urlopen(url)
        doc = lxml.html.fromstring(leg_page_html)
        doc.make_links_absolute(url)
        table = doc.xpath('//table[@summary="Listado de Senadores"]')[0]

        # skip first row
        for row in table.xpath('tr')[1:]:
            tds = row.xpath('td')

            name = tds[0].text_content().title().replace(
                'Hon.', '', 1).strip()
            party = tds[1].text_content()
            phone = tds[2].text_content()
            email = tds[3].text_content()

            # shapefiles denote 0 as At-Large Districts
            if counter == 0:
                district = 'At-Large'
            else:
                district = str(counter)

            # Code to guess the picture.
            # Those middle names abbreviations are sometimes weird, so
            # make sure every period is followed by a space.
            namefixed = unicode(name.replace(".", ". "))
            # Remove the accents
            namefixed = unicodedata.normalize(
                'NFKD', namefixed).encode('ascii', 'ignore')
            nameparts = namefixed.split()
            # Skip over a middle initial when picking the last name.
            if nameparts[1].endswith('.'):
                lastname = nameparts[2]
            else:
                lastname = nameparts[1]

            # Construct the photo url
            picture_filename = (
                'http://www.senadopr.us/Fotos%20Senadores/sen_' +
                (nameparts[0][0] + lastname).lower() + '.jpg')

            extras = {}
            try:
                # Checking to see if the file is there
                self.urlopen(picture_filename)
            except scrapelib.HTTPError:
                # If not, leave out the photo_url.
                # NOTE(review): the phone is only recorded on the
                # legislator itself on this path; it is always recorded
                # on the capitol office below.
                extras['phone'] = phone
            else:
                extras['photo_url'] = picture_filename

            leg = Legislator(term, 'upper', district, name, party=party,
                             email=email, url=url, **extras)
            leg.add_office('capitol', 'Oficina del Capitolio',
                           phone=phone)
            leg.add_source(url)
            self.save_legislator(leg)
def scrape_upper(self, term):
    """Scrape the New York Senate member listing and save each senator.

    Party is recorded as "Unknown".  Contact details are filled in by
    ``scrape_upper_contact_info`` from each senator's contact page.
    """
    listing_url = "http://www.nysenate.gov/senators"

    with self.urlopen(listing_url) as raw:
        doc = lxml.html.fromstring(raw)
        doc.make_links_absolute(listing_url)

        for anchor in doc.xpath('//a[contains(@href, "/senator/")]'):
            # Links without text and the Contact / RSS links are not
            # names.
            if anchor.text in (None, "Contact", "RSS"):
                continue

            name = anchor.text.strip()

            district_text = anchor.xpath(
                "string(../../../div[3]/span[1])")
            district = re.match(
                r"District (\d+)", district_text).group(1)

            photo_url = anchor.xpath(
                "../../../div[1]/span/a/img")[0].attrib["src"]

            senator = Legislator(term, "upper", district, name,
                                 party="Unknown", photo_url=photo_url)
            senator.add_source(listing_url)

            contact_url = anchor.xpath(
                "../span[@class = 'contact']/a")[0].attrib["href"]
            self.scrape_upper_contact_info(senator, contact_url)

            # The member page is the contact address without '/contact'.
            senator["url"] = contact_url.replace("/contact", "")

            self.save_legislator(senator)
def scrape_rep(self, name, term, url):
    """Scrape one representative's page and save the legislator.

    ``name`` arrives in "Last, First" form; generational suffixes are
    moved to the end before the name is recorded.
    """
    # special case names that confuses name_tools
    if name == 'Franklin, A.B.':
        name = 'Franklin, A. B.'
    else:
        # Only the first matching suffix is moved.
        for suffix in ('Jr.', 'III'):
            embedded = ', %s, ' % suffix
            if embedded in name:
                name = name.replace(embedded, ' ') + ', ' + suffix
                break

    party_phrases = (("Democrat District", "Democratic"),
                     ("Republican District", "Republican"),
                     ("Independent District", "Independent"))

    with self.urlopen(url) as text:
        doc = lxml.html.fromstring(text)

        # The district number is taken from the map file name.
        map_href = doc.xpath(
            "//a[contains(@href, 'district')]")[0].attrib['href']
        district = re.search("district(\d+).pdf", map_href).group(1)

        # Party is detected from the raw page text.
        party = "Other"
        for phrase, label in party_phrases:
            if phrase in text:
                party = label
                break

        rep = Legislator(term, 'lower', district, name, party=party,
                         url=url)
        rep.add_source(url)
        self.save_legislator(rep)
def scrape(self, chamber, term):
    """Scrape the Wisconsin member list and save each legislator.

    'upper' selects the Senate list; anything else selects the
    Assembly list.  Committee roles are attached by ``add_committees``.
    """
    self.validate_term(term)

    house = 'senate' if chamber == 'upper' else 'assembly'
    url = ("http://legis.wi.gov/w3asp/contact/legislatorslist.aspx?house="
           + house)

    with self.urlopen(url) as body:
        doc = lxml.html.fromstring(body)

        for tr in doc.cssselect("#ctl00_C_dgLegData tr"):
            # Rows without a link in a cell are not member rows.
            if not tr.cssselect("td a"):
                continue

            cells = list(tr)
            rep_url = ('http://legis.wi.gov/w3asp/contact/' +
                       cells[0].cssselect("a[href]")[0].get("href"))

            # First cell reads "<name> (<party letter>)".
            found = re.findall(r'([\w\-\,\s\.]+)\s+\(([\w])\)',
                               cells[0].text_content())
            if not found:
                continue
            full_name, party = found[0]

            # skip if the legislator is vacant (occurred in 2011 session)
            if full_name == 'Vacant':
                continue

            party = PARTY_DICT[party]
            district = str(int(cells[2].text_content()))

            member = Legislator(term, chamber, district, full_name,
                                party=party, url=rep_url)
            member.add_source(rep_url)

            member = self.add_committees(member, rep_url, term, chamber)
            self.save_legislator(member)
def scrape_lower(self, term):
    """Scrape the Utah House member table and save each representative.

    Rows marked "Empty" are logged and skipped.  Raises ``ValueError``
    for a party letter other than D or R.
    """
    party_names = {"D": "Democratic", "R": "Republican"}

    listing_url = "http://le.utah.gov/house2/representatives.jsp"
    listing = lxml.html.fromstring(self.urlopen(listing_url))
    listing.make_links_absolute(listing_url)

    # The first table row is skipped.
    for tr in listing.xpath("//tr")[1:]:
        cells = tr.xpath("td")
        district = cells[0].text_content()

        if cells[1].text_content() == "Empty":
            self.log("district %s is empty" % district)
            continue

        anchor = cells[1].xpath("a")[0]
        name = anchor.text_content()
        member_url = anchor.get("href")

        code = cells[2].text_content()
        if code not in party_names:
            raise ValueError("unknown party")
        party = party_names[code]

        # get photo
        member_doc = lxml.html.fromstring(self.urlopen(member_url))
        member_doc.make_links_absolute(member_url)
        photo_url = member_doc.xpath('//img[@alt="photo"]/@src')[0]

        rep = Legislator(term, "lower", district, name, party=party,
                         photo_url=photo_url, url=member_url)
        rep.add_source(listing_url)
        rep.add_source(member_url)
        self.save_legislator(rep)
def scrape_legislator_data(self, url, chamber):
    """Scrape the member grid at ``url`` plus each member's detail page.

    Every legislator is saved for the '2010' term together with the
    committee roles listed on the detail page.
    """
    party_fulls = {'R' : 'Republican', 'D' : 'Democrat'}

    with self.urlopen(url) as raw:
        listing = BeautifulSoup(raw)
        grid = listing.find('table', id = 'ctl00_mainCopy_DataList1')

        for cell in grid('td'):
            spans = cell('span')
            if not spans:
                self.debug('Found an empty cell in %s. Continuing' % url)
                continue

            full_name = ' '.join([span.string.strip() for span in spans])

            # The first span holds the first name, optionally followed
            # by a single middle name; the second holds the last name.
            given = spans[0].string.strip()
            given_parts = given.split()
            if len(given_parts) == 2:
                first_name, middle_name = given_parts
            else:
                first_name, middle_name = given, ''
            last_name = spans[1].string.strip()

            details_url = get_abs_url(url, cell.find('a')['href'])

            with self.urlopen(details_url) as raw_details:
                details = BeautifulSoup(raw_details)

                district = details.find(
                    'a', id = 'ctl00_mainCopy_LegisInfo_DISTRICTLabel'
                ).string.strip()
                party = party_fulls[details.find(
                    'span', id = 'ctl00_mainCopy_LegisInfo_PARTYLabel'
                ).string]

                member = Legislator('2010', chamber, district, full_name,
                                    first_name, last_name, middle_name,
                                    party)
                member.add_source(details_url)

                memberships = details.find(
                    'table', id = 'ctl00_mainCopy_MembershipGrid')
                # The first row of the membership table is skipped.
                for tr in memberships('tr')[1:]:
                    columns = tr('td')
                    role_type = columns[0].string.strip()
                    committee = columns[1]('a')[0].string.strip()
                    member.add_role(role_type, '2010', chamber = chamber,
                                    committee = committee)

                self.save_legislator(member)
def scrape_2011Leg(self, chamber, term, url):
    """Scrape the 2011 member grid at ``url`` and save each legislator.

    Only rows of the ``GridView1`` table that link to a member page are
    read.  The district is assembled from two labels.
    """
    titles = {'lower': 'Representative', 'upper': 'Senator'}
    parties = {'D': 'Democrat', 'R': 'Republican'}

    with self.urlopen(url) as raw:
        doc = lxml.html.fromstring(raw)
        doc.make_links_absolute(url)
        grid = doc.xpath('//table[contains(@id, "GridView1")]')[0]

        for tr in grid.xpath('tr[td/a[contains(@href, "memberpage")]]'):
            district_a = tr.xpath(
                'td/span[contains(@id, "LabelDis")]/font')[0].text
            district_b = tr.xpath(
                'td/span[contains(@id, "LabelDistrict2")]/font')[0].text
            # Replace any / in district name to allow json file to save.
            district = (district_a + " " + district_b).replace('/', '-')

            extras = {'title': titles.get(chamber, '')}

            last_name = tr.xpath(
                'td/a[contains(@id, "HyperLinkLast")]/font'
            )[0].text.strip()
            first_names = tr.xpath(
                'td/span[contains(@id, "LabelFirst")]/font'
            )[0].text.strip()
            # First word is the first name; the rest is the middle name.
            given = first_names.split()
            first_name = given[0]
            middle_name = ' '.join(given[1:])

            party_label = tr.xpath(
                'td/span[contains(@id, "LabelParty")]/font')[0].text
            party_code = party_label.replace('(', '').replace(')', '')
            # Expand party from initial letter.
            party = parties.get(party_code, '')

            room = tr.xpath('td/span[contains(@id, "LabelRoom")]')[0].text
            room2 = tr.xpath('td/span[contains(@id, "LabelRoom2")]')[0].text
            extras['office_address'] = room + " " + room2
            extras['photo_url'] = tr.xpath(
                'td/a[contains(@id, "HyperLinkChairJPG")]/img'
            )[0].attrib['src']
            extras['email'] = tr.xpath(
                'td/a[contains(@id, "HyperLinkEmail")]')[0].text
            extras['phone'] = tr.xpath(
                'td/span[contains(@id, "LabelPhone2")]')[0].text

            full_name = first_names + " " + last_name

            member = Legislator(term, chamber, district, full_name,
                                first_name, last_name, middle_name, party,
                                **extras)
            member.add_source(url)
            self.save_legislator(member)
def scrape_legislator(self, chamber, term, name, url):
    """Scrape one Alaska member page and save the legislator.

    District, party and e-mail are extracted from the text of the
    ``fullpage`` div.
    """
    with self.urlopen(url) as raw:
        # Alaska fails at unicode, some of the pages have broken
        # characters. They're not in data we care about so just
        # replace them.
        doc = lxml.html.fromstring(raw.decode('utf8', 'replace'))

        # Collapse runs of whitespace in the supplied name.
        name = re.sub(r'\s+', ' ', name)

        info = doc.xpath('string(//div[@id = "fullpage"])')

        district = re.search(r'District ([\w\d]+)', info).group(1)
        party = re.search(
            r'Party: (.+) Toll-Free', info).group(1).strip()
        email = re.search(
            r'Email: ([\w_]+@legis\.state\.ak\.us)', info).group(1)

        # for consistency
        if party == 'Democrat':
            party = 'Democratic'

        member = Legislator(term, chamber, district, name, party=party,
                            email=email, url=url)
        member.add_source(url)
        self.save_legislator(member)
def scrape(self, chamber, term):
    """Download the Rhode Island member spreadsheet and save its rows.

    The ``.xls`` file for ``chamber`` ('upper' or 'lower') is saved as
    ``ri_leg.xls``; every row after the header becomes a legislator.
    """
    if chamber == 'upper':
        url = ('http://webserver.rilin.state.ri.us/Documents/Senators.xls')
        rep_type = 'Senator '
    elif chamber == 'lower':
        url = (
            'http://webserver.rilin.state.ri.us/Documents/Representatives.xls')
        rep_type = 'Representative '

    # Spreadsheet party labels mapped to the names recorded.
    translate = {
        "Democrat" : "Democratic",
        "Republican" : "Republican",
        "Independent" : "Independent"
    }

    self.urlretrieve(url, 'ri_leg.xls')
    sheet = xlrd.open_workbook('ri_leg.xls').sheet_by_index(0)

    # Row 0 is the header row.
    for rownum in xrange(1, sheet.nrows):
        record = {}
        for field, col_num in excel_mapping.iteritems():
            record[field] = sheet.cell(rownum, col_num).value

        # The district cell is numeric; record it as a plain integer
        # string.
        district_name = str(int(record['district']))
        # Remove the "Senator " / "Representative " title from the name.
        full_name = re.sub(rep_type, '', record['full_name']).strip()

        member = Legislator(term, chamber, district_name, full_name,
                            '', '', '',
                            translate[record['party']],
                            town_represented=record['town_represented'],
                            email=record['email'])
        member.add_office('district', 'Address',
                          address=record['address'])
        member.add_source(url)
        self.save_legislator(member)
def scrape(self, chamber, session):
    """Scrape every member page found in the directory and save it.

    ``scrape_directory`` supplies a mapping of district to member page
    address; ``process_person`` supplies the data for each page.
    """
    directory_url = self.get_district_list(chamber, session)
    people_pages = self.scrape_directory(directory_url, chamber, session)

    for district in people_pages:
        page_url = people_pages[district]
        info = self.process_person(page_url)

        member = Legislator(
            session, chamber, district, info['name'],
            party=info['party'],
            # some additional things the website provides:
            occupation=info['occupation'],
            photo_url=info['photo_url'],
            url=info['homepage'])

        if "email" in info:
            member['email'] = info['email']

        # The capitol office is only recorded when a phone was found.
        if "number" in info:
            member.add_office('capitol', 'Capitol Office',
                              phone=info['number'],
                              address='200 E. Colfax\nDenver, CO 80203')

        member.add_source(page_url)

        if 'ctty' in info:
            for committee in info['ctty']:
                member.add_role('committee member',
                                term=session,
                                chamber=chamber,
                                committee=clean_committee(committee),
                                position="member")

        self.save_legislator(member)
def scrape(self, chamber, session):
    """Save every legislator returned by ``scrape_leg_page``.

    Committee memberships are attached as roles; committees whose name
    starts with "Joint Legislative" are recorded under the "joint"
    chamber.
    """
    joint_prefix = 'Joint Legislative'
    records = self.scrape_leg_page(get_chamber_listing_url(chamber))

    for record in records:
        member = Legislator(
            session, chamber, record['district'], record['name'],
            party=record['party'],
            # some additional things the website provides:
            photo_url=record['image'],
            url=record['homepage'],
            room=record['room'],
            phone=record['phone'],
            fax=record['fax'],
            email=record['email'],
            address=record['addr'])

        for source in record['source']:
            member.add_source(source)

        try:
            for committee in record['ctty']:
                if committee['name'].startswith(joint_prefix):
                    role_chamber = "joint"
                else:
                    role_chamber = chamber

                member.add_role('committee member',
                                term=session,
                                chamber=role_chamber,
                                committee=committee['name'],
                                position="member")
        except KeyError:
            # Raised when the record carries no committee data.
            self.log("XXX: Warning, %s has no scraped Commities" %
                     record['name'])

        self.save_legislator(member)
def scrape_lower(self, term):
    """Scrape the Oklahoma House member table and save each member.

    :param term: term identifier passed through to ``Legislator``

    Rows whose name starts with "House District" are skipped with a
    warning.  Each remaining member's district page is fetched for the
    photo and office details.

    Raises ``KeyError`` for a party letter other than R or D and
    ``IndexError`` when a district page has no high-resolution photo
    link.
    """
    party_names = {'R': 'Republican', 'D': 'Democratic'}

    url = "http://www.okhouse.gov/Members/Default.aspx"
    page = lxml.html.fromstring(self.urlopen(url))
    page.make_links_absolute(url)

    # The first table row is skipped.
    for tr in page.xpath("//table[@class='rgMasterTable']/tbody/tr")[1:]:
        name = tr.xpath('.//td[1]/a')[0].text.strip()
        district = tr.xpath('.//td[3]')[0].text_content().strip()
        leg_url = ('http://www.okhouse.gov/District.aspx?District=' +
                   district)

        # Decide whether to skip before looking up the party or
        # fetching the district page, so a placeholder row neither
        # costs a request nor fails on data it does not carry.
        if name.startswith('House District'):
            self.warning("skipping %s %s" % (name, leg_url))
            continue

        party = tr.xpath('.//td[4]')[0].text_content().strip()
        party = party_names[party]

        leg_doc = lxml.html.fromstring(self.urlopen(leg_url))
        leg_doc.make_links_absolute(leg_url)
        photo_url = leg_doc.xpath(
            '//a[contains(@href, "HiRes")]/@href')[0]

        leg = Legislator(term, 'lower', district, name, party=party,
                         photo_url=photo_url, url=leg_url)
        leg.add_source(url)
        leg.add_source(leg_url)

        # Scrape offices.
        self.scrape_lower_offices(leg_doc, leg)

        self.save_legislator(leg)
def scrape(self, chamber, session):
    """Scrape every member page found in the directory and save it.

    A capitol office is always recorded; its phone and e-mail are
    ``None`` when the member page did not provide them.
    """
    directory_url = self.get_district_list(chamber, session)
    people_pages = self.scrape_directory(directory_url, chamber, session)

    for district in people_pages:
        page_url = people_pages[district]
        info = self.process_person(page_url)

        member = Legislator(
            session, chamber, district, info['name'],
            party=info['party'],
            # some additional things the website provides:
            occupation=info['occupation'],
            photo_url=info['photo_url'],
            url=info['homepage'])

        phone = None
        if 'number' in info:
            phone = info['number']

        email = None
        if 'email' in info:
            email = info['email']

        member.add_office('capitol', 'Capitol Office',
                          phone=phone,
                          address='200 E. Colfax\nDenver, CO 80203',
                          email=email)

        member.add_source(page_url)
        self.save_legislator(member)
def scrape_page(self, chamber, term, url):
    """Scrape every ``memberModule`` block on ``url`` and save it.

    Each block yields a legislator with a capitol office;
    ``scrape_homepage`` then adds data from the member's own page.
    """
    party_names = {"R": "Republican", "D": "Democratic"}

    doc = lxml.html.fromstring(self.urlopen(url))
    doc.make_links_absolute(url)

    # Match the class as a whole word within the class attribute.
    modules = doc.xpath(
        "//div[contains(concat(' ', "
        "normalize-space(@class), ' '), ' memberModule ')]")

    for module in modules:
        photo_url = module.xpath(
            ".//div[@class='thumbnail']//img")[0].attrib['src']
        data = module.xpath(".//div[@class='data']")[0]

        name_link = data.xpath(".//a[@class='black']")[0]
        full_name = name_link.text_content()
        homepage = name_link.attrib['href']

        letter = data.xpath(
            ".//span[@class='partyLetter']")[0].text_content()
        party = party_names[letter]

        # The last bare text node is the phone; the rest is the address.
        text_lines = data.xpath("child::text()")
        phone = text_lines.pop(-1)
        office = "\n".join(text_lines)

        headings = data.xpath("./h3")
        if len(headings):
            district = headings[0].xpath("./br")[0].tail.replace(
                "District", "").strip()
        else:
            # Fall back to the number in the last "<n>.png" of the
            # block's inline style.
            images = re.findall("\d+\.png", module.attrib['style'])
            district = images[-1].split(".", 1)[0]

        full_name = re.sub("\s+", " ", full_name).strip()

        member = Legislator(term, chamber, district, full_name,
                            party=party, url=homepage,
                            photo_url=photo_url)
        member.add_office('capitol', 'Capitol Office',
                          address=office, phone=phone)

        self.scrape_homepage(member, chamber, homepage, term)

        member.add_source(url)
        self.save_legislator(member)
def scrape(self, chamber, session):
    """Save every legislator returned by ``scrape_leg_page``.

    The chamber recorded for each legislator comes from the scraped
    record ("House" / "Senate"), not from the ``chamber`` argument;
    records with any other value are reported on stdout and skipped.
    """
    chamber_ids = {"House": "lower", "Senate": "upper"}
    joint_prefix = 'Joint Legislative'

    records = self.scrape_leg_page(get_legislator_listing_url(chamber))

    for record in records:
        try:
            leg_chamber = chamber_ids[record['chamber']]
        except KeyError:
            print("")
            print(" ERROR: Bad Legislator page.")
            print(" -> " + "\n -> ".join(record['source']))
            print("")
            print(" Added this workaround because of a bad legislator")
            print(" page, while they filled their info out.")
            print("")
            print(" Emailed webmaster. Told to wait.")
            print(" - PRT, Jun 23, 2014")
            print("")
            continue

        member = Legislator(
            session, leg_chamber, record['district'], record['name'],
            party=record['party'],
            # some additional things the website provides:
            photo_url=record['image'],
            url=record['homepage'],
            email=record['email'])
        member.add_office('capitol', 'Capitol Office',
                          address=record['addr'],
                          phone=record['phone'],
                          fax=record['fax'] or None)

        for source in record['source']:
            member.add_source(source)

        try:
            for committee in record['ctty']:
                if committee['name'].startswith(joint_prefix):
                    role_chamber = "joint"
                else:
                    role_chamber = leg_chamber

                member.add_role('committee member',
                                term=session,
                                chamber=role_chamber,
                                committee=committee['name'],
                                position="member")
        except KeyError:
            # Raised when the record carries no committee data.
            self.log("XXX: Warning, %s has no scraped Commities" %
                     record['name'])

        self.save_legislator(member)
def scrape(self, chamber, term):
    """Scrape legislators for the latest term only.

    Every ``row-equal-height`` block on the chamber index page yields one
    legislator with district/capitol offices and committee roles.
    """
    self.validate_term(term, latest_only=True)

    url = BASE_URL % CHAMBERS[chamber].lower()
    doc = lxml.html.fromstring(self.get(url).text)
    doc.make_links_absolute(url)

    for row in doc.xpath('//div[contains(@class, "row-equal-height")]'):
        img_url = row.xpath('.//img/@src')[0]
        # the second inner wrapper is the one holding the member details
        inner = row.xpath('.//div[@class="vc-column-innner-wrapper"]')[1]

        strong = inner.xpath('p/strong')[0]
        name = strong.text.replace(u'\xa0', ' ').strip()
        name = re.sub('\s+', ' ', name)
        party = PARTY[strong.tail.strip()]
        email = inner.xpath('p/strong/a')[0].text

        district = inner.xpath('p/a')[0].text.replace('District ', '')
        leg_url = inner.xpath('p/a/@href')[0]

        leg = Legislator(term, chamber, district, name,
                         party=party, email=email)

        phones = get_phones(inner)
        leg.add_office('district', 'District Office',
                       address=get_address(inner),
                       fax=get_fax(inner),
                       phone=phones.get('home') or phones.get('business'))
        leg.add_office('capitol', 'Capitol Office',
                       phone=phones.get('office'))

        leg.add_source(url)
        leg['photo_url'] = img_url
        leg['url'] = leg_url

        for committee in inner.xpath('p/a[contains(@href, "committees")]'):
            # text following the link names the position; blank => member
            role = committee.tail.strip() or 'member'
            leg.add_role('committee member',
                         term=term,
                         chamber=chamber,
                         committee=committee.text,
                         position=role)

        self.save_legislator(leg)
def scrape_senate(self, term):
    """Scrape upper-chamber members from the index page plus a CSV export.

    The HTML index supplies per-name URL, photo and email; the CSV supplies
    district, party and address.  The two are joined on
    "First Name Last Name".
    """
    index_url = 'http://www.senate.mn/members/index.php'
    doc = lxml.html.fromstring(self.urlopen(index_url))
    doc.make_links_absolute(index_url)

    extras_by_name = defaultdict(dict)

    # get all the tds in a certain div
    cells = doc.xpath(
        '//div[@id="hide_show_alpha_all"]//td[@style="vertical-align:top;"]')
    for cell in cells:
        # each td has 2 <a>s - site & email
        main_link, email_link = cell.xpath('.//a')

        name = main_link.text_content().split(' (')[0]
        extras = extras_by_name[name]
        extras['leg_url'] = main_link.get('href')
        extras['photo_url'] = cell.xpath(
            './preceding-sibling::td/a/img/@src')[0]

        href = email_link.get('href')
        if 'mailto:' in href:
            extras['email'] = href.replace('mailto:', '')

    self.info('collected preliminary data on %s legislators',
              len(extras_by_name))
    assert extras_by_name

    # use CSV for most of data
    csv_url = 'http://www.senate.mn/members/member_list_ascii.php?ls='
    csvfile = self.urlopen(csv_url)

    for row in csv.DictReader(StringIO(csvfile)):
        first_name = row['First Name']
        if not first_name:
            continue

        last_name = row['Last Name']
        name = '%s %s' % (first_name, last_name)
        party = self._parties[row['Party']]

        leg = Legislator(term, 'upper', row['District'].lstrip('0'), name,
                         party=party,
                         first_name=first_name,
                         last_name=last_name,
                         **extras_by_name[name])
        leg.add_office(
            'capitol', 'Capitol Office',
            address='{Address}\n{Address2}\n{City}, {State} {Zipcode}'.format(
                **row))

        leg.add_source(csv_url)
        leg.add_source(index_url)
        self.save_legislator(leg)
def scrape(self, term, chambers):
    """Load all members from the legislature's JSON endpoint and save them.

    The year slug is taken from ``term[5:]``.  ``chambers`` is not used:
    each record's ``Title`` decides whether it is saved as upper or lower.
    """
    email_suffix = "@leg.state.vt.us"
    year_slug = term[5:]

    # Load all members via the private API
    legislator_dump_url = (
        'http://legislature.vermont.gov/people/loadAll/{}'.format(year_slug))
    payload = json.loads(self.urlopen(legislator_dump_url))

    # Parse the information from each legislator
    for raw in payload['data']:
        # Strip whitespace from strings
        info = {key: value.strip() for key, value in raw.iteritems()}

        chamber = 'upper' if info['Title'] == 'Senator' else 'lower'
        district = info['District'].replace(" District", "")

        middle = " " + info['MI'] if info['MI'] else ""
        full_name = "{0}{1} {2}".format(
            info['FirstName'], middle, info['LastName'])

        # the photo file is named after the local part of the email address
        photo_url = (
            'http://legislature.vermont.gov/assets/Documents/Legislators/{}.jpg'
            .format(info['Email'][:-len(email_suffix)]))

        leg = Legislator(term=term,
                         chamber=chamber,
                         district=district,
                         party=info['Party'],
                         email=info['Email'],
                         full_name=full_name,
                         photo_url=photo_url)
        leg.add_source(legislator_dump_url)

        second_line = ("\n" + info['MailingAddress2']
                       if info['MailingAddress2'] else "")
        address = "{0}{1}\n{2}, {3} {4}".format(
            info['MailingAddress1'],
            second_line,
            info['MailingCity'],
            info['MailingState'],
            info['MailingZIP'])

        leg.add_office(type='district',
                       name='District Office',
                       address=address,
                       phone=info['HomePhone'] or None,
                       email=info['HomeEmail'] or None)

        self.save_legislator(leg)
def scrape_house(self, term):
    """Scrape lower-chamber members from the house member table.

    Only rows with exactly five cells are treated as members.  Each
    member's own page is fetched to look for a photo.
    """
    party_names = {'(R)': 'Republican',
                   '(DFL)': 'Democratic-Farmer-Labor'}
    address_template = ('{0} State Office Building\n'
                        '100 Rev. Dr. Martin Luther King Jr. Blvd.\n'
                        'St. Paul, MN 55155')

    url = 'http://www.house.leg.state.mn.us/members/housemembers.asp'
    doc = lxml.html.fromstring(self.urlopen(url))
    doc.make_links_absolute(url)

    # skip first header row
    for row in doc.xpath('//tr')[1:]:
        cells = [td.text_content().strip() for td in row.xpath('td')]
        if len(cells) != 5:
            continue

        district = cells[0].lstrip('0')
        name, party = cells[1].rsplit(' ', 1)
        # any other party marker is passed through unchanged
        party = party_names.get(party, party)

        leg_url = row.xpath('td[2]/p/a/@href')[0]
        room, phone, email = cells[2], cells[3], cells[4]

        leg = Legislator(term, 'lower', district, name,
                         party=party, email=email, url=leg_url)
        leg.add_office('capitol', 'Capitol Office',
                       address=address_template.format(room),
                       phone=phone)

        # add photo_url
        leg_doc = lxml.html.fromstring(self.urlopen(leg_url))
        img_src = leg_doc.xpath('//img[contains(@src, "memberimg")]/@src')
        if img_src:
            leg['photo_url'] = img_src[0]

        leg.add_source(url)
        leg.add_source(leg_url)
        self.save_legislator(leg)
def scrape(self, chamber, session):
    """Scrape the 2009 legislator directory for one chamber.

    Raises ``NoDataForPeriod`` for any session whose year is not 2009.
    The directory table is read three rows at a time: name/party,
    district, email.
    """
    # All other years are stored in a pdf
    # http://www.capitol.hawaii.gov/session2009/misc/statehood.pdf
    if year_from_session(session) != 2009:
        raise NoDataForPeriod(session)

    page_name = 'sendir.asp' if chamber == 'upper' else 'repdir.asp'
    legislators_page_url = BASE_URL + "/site1/info/direct/" + page_name

    with self.urlopen(legislators_page_url) as legislators_page_html:
        legislators_page = lxml.html.fromstring(legislators_page_html)

        # get all rows (except first) of first table,
        # grouped in sets of 3
        rows = legislators_page.xpath('//table[1]/tr')[1:]
        for name_row, district_row, email_row in grouper(3, rows):
            # the first link in the name row points at the member's page
            _, _, link, _ = next(name_row.iterlinks())
            source = BASE_URL + link

            name_cell = name_row.cssselect('td')[0]
            name, _, party = name_cell.text_content().partition("(")
            # remove space at the beginning
            name = name.strip()

            party = 'Republican' if party == 'R)' else 'Democratic'

            district = district_row.cssselect('td')[1].text_content()

            # Remove white space
            email = email_row.cssselect('a')[0].text_content().strip()

            leg = Legislator(session, chamber, district, name,
                             party=party, official_email=email)
            leg.add_source(source)
            self.save_legislator(leg)
def scrape(self, chamber, term):
    """Scrape the alphabetical member list for one chamber (latest term).

    Every ``<li><a>`` with an href is treated as a member; the text after
    the anchor must match the party/district pattern.  Each member page is
    fetched to look for a left-aligned photo.
    """
    urls = {
        'lower': "http://www.msa.md.gov/msa/mdmanual/06hse/html/hseal.html",
        'upper': "http://www.msa.md.gov/msa/mdmanual/05sen/html/senal.html",
    }
    detail_re = re.compile('\((R|D)\), (?:Senate President, )?(?:House Speaker, )?District (\w+)')

    self.validate_term(term, latest_only=True)

    with self.urlopen(urls[chamber]) as html:
        doc = lxml.html.fromstring(html)

        # data on this page is <li>s that have anchor tags
        for anchor in doc.cssselect('li a'):
            link = anchor.get('href')

            # tags don't close so we get the <li> and <a> content and
            # diff them
            name_text = anchor.text_content()
            detail_text = anchor.getparent().text_content().replace(
                name_text, '')

            # ignore if it is not a valid link
            if not link:
                continue

            # handle names ("Last, First[, Suffix]")
            pieces = name_text.split(',')
            last_name = pieces[0]
            first_name = pieces[1].strip()
            # TODO: try to trim first name to remove middle initial
            suffixes = pieces[2] if len(pieces) > 2 else ''

            # handle details
            party, district = detail_re.match(detail_text.strip()).groups()
            party = PARTY_DICT[party]

            leg = Legislator(term, chamber, district,
                             ' '.join((first_name, last_name)),
                             first_name, last_name, '', party,
                             suffixes=suffixes)

            leg_url = BASE_URL + link
            leg.add_source(url=leg_url)

            with self.urlopen(leg_url) as leg_html:
                leg_doc = lxml.html.fromstring(leg_html)
                img_src = leg_doc.xpath('//img[@align="left"]/@src')
                if img_src:
                    leg['photo_url'] = BASE_URL + img_src[0]

            self.save_legislator(leg)
def scrape_legislators(self, term, chamber, leg_page, member_url, main_url):
    """Build and save a legislator from an already-parsed member page.

    :param term: term the legislator is serving in
    :param chamber: chamber the legislator belongs to
    :param leg_page: parsed lxml document of the member page
    :param member_url: URL of the member page (stored as url and source)
    :param main_url: URL of the listing page (stored as a second source)

    The heading is expected to hold a leading title word followed by either
    "First Last" or "First Middle Last".
    """
    # Every field lives in the same nested info table; only the row differs.
    info_table = '//div[@class="content"][1]/table[1]//tr[1]/td[2]/table'

    def cell_text(path):
        return leg_page.xpath(info_table + path)[0].text

    # The first word of the heading is skipped as a title.
    name_parts = cell_text('//tr[1]/td/h2').split()
    if len(name_parts) == 3:
        first_name = name_parts[1]
        middle_name = ''
        last_name = name_parts[2]
        full_name = first_name + ' ' + last_name
    else:
        first_name = name_parts[1]
        middle_name = name_parts[2]
        last_name = name_parts[3]
        full_name = first_name + ' ' + middle_name + ' ' + last_name

    district = cell_text('//tr[5]/td[2]')
    party = cell_text('//tr[6]/td[2]')
    full_address = cell_text('//tr[2]/td[2]')
    phone = cell_text('//tr[3]/td[2]')
    email = cell_text('//tr[4]/td[2]/a')

    if party == 'Democrat':
        party = 'Democratic'

    # Legislator takes (term, chamber, ...) positionally, as in every other
    # call in this file; passing them the other way round stores the
    # chamber as the term and vice versa.
    leg = Legislator(term, chamber, district, full_name,
                     first_name, last_name, middle_name, party,
                     full_address=full_address, phone=phone,
                     email=email, url=member_url)
    leg.add_source(member_url)
    leg.add_source(main_url)
    self.save_legislator(leg)
def scrape_member(self, chamber, term, member_url):
    """Scrape one member page and save the legislator found there.

    A "Representative"/"Senator" title on the page overrides the
    ``chamber`` argument.  Email and occupation are stored only when the
    info table contains them.
    """
    chamber_by_title = {'Representative': 'lower', 'Senator': 'upper'}
    party_names = {'(R)': 'Republican', '(D)': 'Democratic'}

    with self.urlopen(member_url) as page:
        root = lxml.html.fromstring(page)

        # "<Title> <name words...> (<party>)"
        words = root.xpath('string(//td[@class="SiteNames"])').split()
        chamber = chamber_by_title.get(words[0], chamber)
        full_name = ' '.join(words[1:-1])
        party = party_names.get(words[-1], words[-1])

        photo_url = root.xpath('//img[@class="SitePhotos"]')[0].attrib['src']

        # Need to figure out a cleaner method for this later
        info_box = root.xpath('string(//table[@class="InfoTable"])')
        district = re.search(r'District(.+)\r', info_box).group(1)

        leg = Legislator(term, chamber, district, full_name,
                         party=party, photo_url=photo_url)
        leg.add_source(member_url)

        # optional fields: absent from the info box => simply not set
        optional = (('email', r'Email(.+)\r'),
                    ('occupation', r'Occupation(.+)\r'))
        for key, pattern in optional:
            found = re.search(pattern, info_box)
            if found is not None:
                leg[key] = found.group(1)

        self.save_legislator(leg)
def scrape(self, chamber, term):
    """Scrape the house or senate member table (latest term only).

    The lower chamber comes from a seven-column table; any other chamber
    value is read from the senate member list.
    """
    self.validate_term(term, latest_only=True)
    party_names = {'D': 'Democratic', 'R': 'Republican'}

    if chamber == 'lower':
        url = 'http://house.michigan.gov/replist.asp'
        with self.urlopen(url) as html:
            doc = lxml.html.fromstring(html)
            # skip two rows at top
            for row in doc.xpath('//table[@cellspacing=0]/tr')[2:]:
                cells = [td.text_content().strip()
                         for td in row.xpath('td')]
                (district, last_name, first_name,
                 party, office, phone, email) = cells
                leg = Legislator(term=term, chamber=chamber,
                                 district=str(int(district)),
                                 full_name=first_name + " " + last_name,
                                 first_name=first_name,
                                 last_name=last_name,
                                 party=party_names[party],
                                 office=office, phone=phone, email=email)
                leg.add_source(url)
                self.save_legislator(leg)
        return

    url = 'http://www.senate.michigan.gov/members/memberlist.htm'
    with self.urlopen(url) as html:
        doc = lxml.html.fromstring(html)
        for row in doc.xpath('//table[@width=550]/tr')[1:39]:
            # party, dist, member, office_phone, office_fax, office_loc
            def first(path, row=row):
                return row.xpath(path)[0]

            party = party_names[first('td[1]/text()')]
            district = first('td[2]/a/text()')
            name = first('td[3]/a/text()')
            office_phone = first('td[4]/text()')
            office_fax = first('td[5]/text()')
            office_loc = first('td[6]/text()')

            leg = Legislator(term=term, chamber=chamber,
                             district=district, full_name=name,
                             party=party,
                             office_phone=office_phone,
                             office_fax=office_fax,
                             office_loc=office_loc)
            leg.add_source(url)
            self.save_legislator(leg)
def scrape_member(self, chamber, year, member_url):
    """Scrape one member bio page and save the legislator.

    A capitol office is added only when the page has a non-blank address;
    otherwise a warning is logged.
    """
    party_names = {'(R)': 'Republican',
                   '(D)': 'Democratic',
                   '(I)': 'Independent'}

    doc = lxml.html.fromstring(self.urlopen(member_url))

    photo_url = doc.xpath('//div[@id="bioImage"]/img/@src')[0]

    # "<title> <name words...> (<party>)"
    words = doc.xpath('//span[@id="name"]/text()')[0].split()
    full_name = ' '.join(words[1:-1]).strip()
    party = party_names.get(words[-1], words[-1])

    district = doc.xpath(
        '//span[@id="districtHeader"]/text()')[0].split()[-1]

    leg = Legislator(year, chamber, district, full_name,
                     party=party, photo_url=photo_url, url=member_url)
    leg.add_source(member_url)

    address_lines = doc.xpath(
        '//div[@id="FrankfortAddresses"]//span[@class="bioText"]/text()')
    address = '\n'.join(address_lines)

    # the last "Annex: " entry, if any, is used as the office phone
    phone = None
    for number in doc.xpath(
            '//div[@id="PhoneNumbers"]//span[@class="bioText"]/text()'):
        if number.startswith('Annex: '):
            phone = number.replace('Annex: ', '')

    if address.strip() == "":
        self.warning("Missing Capitol Office!!")
    else:
        leg.add_office('capitol', 'Capitol Office',
                       address=address, phone=phone)

    self.save_legislator(leg)
def scrape_senators(self, term):
    """Scrape every senator linked from the senate index page.

    Vacant seats are skipped.  Each senator's own page is fetched for the
    photo and then handed to ``scrape_sen_offices``.

    :raises ValueError: if ``term`` is not '2013-2014' and at least one
        non-vacant senator is found.
    """
    url = "http://www.flsenate.gov/Senators/"
    page = lxml.html.fromstring(self.urlopen(url))
    page.make_links_absolute(url)

    for link in page.xpath("//a[contains(@href, 'Senators/s')]"):
        name = re.sub(r'\s+', ' ', link.text.strip())

        # Skip vacancies before doing any network work for the row.
        if 'Vacant' in name:
            continue

        if term != '2013-2014':
            raise ValueError('Please change the senate photo_url string.')

        # Special case - name_tools gets confused
        # by 'JD', thinking it is a suffix instead of a first name
        if name == 'Alexander, JD':
            name = 'JD Alexander'

        leg_url = link.get('href')
        leg_doc = lxml.html.fromstring(self.urlopen(leg_url))
        leg_doc.make_links_absolute(leg_url)

        district = link.xpath("string(../../td[1])")
        party = link.xpath("string(../../td[2])")

        # for consistency
        if party == 'Democrat':
            party = 'Democratic'

        photo_url = leg_doc.xpath('//div[@id="sidebar"]//img/@src').pop()

        leg = Legislator(term, 'upper', district, name,
                         party=party, photo_url=photo_url, url=leg_url)
        leg.add_source(url)
        leg.add_source(leg_url)

        self.scrape_sen_offices(leg, leg_url)
        self.save_legislator(leg)
def scrape_lower_legislator(self, url, leg_info, term):
    """Scrape one lower-chamber member page and save the legislator.

    ``leg_info`` supplies name, district, office address, phone and email;
    the page supplies the photo and the party.  Nothing is saved when the
    name starts with "Vacant".  A bare ``Exception`` is raised when no
    known party word appears in the district line.
    """
    page = self.lxmlize(url)
    photo = xpath_one(page, '//img[@rel="lightbox"]').attrib['src']

    # These lookups are kept even though their results are not used
    # further down; xpath_one is an external helper.
    infoblk = xpath_one(
        page, '//font/b[contains(text(), "CAUCUS/DELEGATION MEMBERSHIP")]')
    infoblk = infoblk.getparent()
    info = infoblk.text_content()
    cty = xpath_one(infoblk, "./b[contains(text(), 'ASSIGNMENTS')]")
    cty = cty.getnext()

    centered_text = page.xpath('//p[@align="center"]//text()')
    partyblk = [text for text in centered_text if "District" in text][0]

    party_flags = {
        "Democrat": "Democratic",
        "Republican": "Republican",
        "Independent": "Independent"
    }

    if leg_info['name'].startswith("Vacant"):
        return

    party = 'other'
    for marker, label in party_flags.items():
        if marker in partyblk:
            party = label

    if party == 'other':
        raise Exception

    leg = Legislator(term, 'lower', leg_info['dist'], leg_info['name'],
                     url=url, party=party, photo_url=photo)

    office = {
        "address": leg_info['office'],
        "phone": leg_info['phone'],
    }
    if leg_info['email'] != "":
        office['email'] = leg_info['email']

    leg.add_office('district', 'District Office', **office)
    leg.add_source(url)
    self.save_legislator(leg)
def scrape_lower(self, term):
    """Scrape lower-chamber members from the representatives table.

    Rows whose second cell reads 'Empty' are logged and skipped.  Each
    member page is fetched for photo, email, address and cell phone.

    :raises ValueError: when the party cell is neither 'D' nor 'R'.
    """
    party_names = {'D': 'Democratic', 'R': 'Republican'}

    url = 'http://le.utah.gov/house2/representatives.jsp'
    doc = lxml.html.fromstring(self.urlopen(url))
    doc.make_links_absolute(url)

    for row in doc.xpath('//tr')[1:]:
        cells = row.xpath('td')

        district = cells[0].text_content()
        if cells[1].text_content() == 'Empty':
            self.log('district %s is empty' % district)
            continue

        anchor = cells[1].xpath('a')[0]
        name = anchor.text_content()
        leg_url = anchor.get('href')

        party = cells[2].text_content()
        if party not in party_names:
            raise ValueError('unknown party')
        party = party_names[party]

        # get photo
        leg_doc = lxml.html.fromstring(self.urlopen(leg_url))
        leg_doc.make_links_absolute(leg_url)

        photo_url = leg_doc.xpath('//img[@alt="photo"]/@src')[0]
        email = leg_doc.xpath('//a[starts-with(@href, "mailto")]')[0].text
        address = leg_doc.xpath('//b[text()="Address:"]')[0].tail.strip()

        cell_labels = leg_doc.xpath('//b[text()="Cell Phone:"]')
        cell = cell_labels[0].tail.strip() if cell_labels else None

        leg = Legislator(term, 'lower', district, name,
                         party=party, photo_url=photo_url,
                         email=email, url=leg_url)
        leg.add_office('district', 'Home', address=address, phone=cell)

        leg.add_source(url)
        leg.add_source(leg_url)
        self.save_legislator(leg)
def scrape_rep(self, name, term, url):
    """Scrape one representative page and save the legislator.

    The ``name`` argument is replaced by the name found on the page.  The
    party is decided by which "<Party> District" phrase occurs in the raw
    response, falling back to "Other".
    """
    party_markers = (("Democrat District", "Democratic"),
                     ("Republican District", "Republican"),
                     ("Independent District", "Independent"))

    with self.urlopen(url) as text:
        page = lxml.html.fromstring(text)

        xpath = '//table[@id="table41"]/tr/td/font'
        name = page.xpath(xpath)[3].xpath('p')[0].text
        name = name.replace('Representative', '').strip().strip(',')

        # the district number is embedded in the district PDF link
        district_href = page.xpath(
            "//a[contains(@href, 'district')]")[0].attrib['href']
        district = re.search("district(\d+).pdf", district_href).group(1)

        for marker, label in party_markers:
            if marker in text:
                party = label
                break
        else:
            party = "Other"

        kwargs = {"party": party, "url": url}

        photos = page.xpath("//img[@rel='lightbox']")
        if photos:
            kwargs['photo_url'] = "http://house.louisiana.gov/H_Reps/%s" % (
                photos[0].attrib['src'])
        else:
            self.warning("No photo found :(")

        district_office = _get_b_tail(page, 'DISTRICT OFFICE')

        mailto = page.xpath('//a[starts-with(@href, "mailto")]/@href')[0]
        # split off extra parts of mailto: link
        email = mailto.split(':')[1].split('?')[0]

        leg = Legislator(term, 'lower', district, name,
                         email=email, **kwargs)
        leg.add_office('district', 'District Office',
                       address=district_office)
        leg.add_source(url)
        self.save_legislator(leg)
def scrape(self, chamber, term_name):
    """Scrape one chamber's legislators from the JSON API.

    The slug of the last session of ``term_name`` selects the API path.
    Each legislator page is fetched for a district office; pages that
    cannot be fetched only produce a warning.
    """
    for term in self.metadata['terms']:
        if term['name'] == term_name:
            session = term['sessions'][-1]
            slug = self.metadata['session_details'][session]['slug']

    if chamber == 'upper':
        chamber_slug = 'Senate'
    elif chamber == 'lower':
        chamber_slug = 'Assembly'

    leg_base_url = 'http://www.leg.state.nv.us/App/Legislator/A/%s/%s/' % (
        chamber_slug, slug)
    leg_json_url = ('http://www.leg.state.nv.us/App/Legislator/A/api/%s/'
                    'Legislator?house=%s' % (slug, chamber_slug))

    for item in json.loads(self.urlopen(leg_json_url)):
        # empty district
        if 'District No' in item['FullName']:
            continue

        leg = Legislator(term_name, chamber, item['DistrictNbr'],
                         item['FullName'], party=item['Party'],
                         photo_url=item['PhotoURL'])
        leg_url = leg_base_url + item['DistrictNbr']

        # fetch office from legislator page
        try:
            doc = lxml.html.fromstring(self.urlopen(leg_url))
            if doc.xpath('//div'):
                address = doc.xpath(
                    '//div[@class="contactAddress"]')[0].text_content()
                second_line = doc.xpath('//div[@class="contactAddress2"]')
                if second_line:
                    address += ' ' + second_line[0].text_content()
                address += '\n' + doc.xpath(
                    '//div[@class="contactCityStateZip"]')[0].text_content()
                phone = doc.xpath(
                    '//div[@class="contactPhone"]')[0].text_content()

                leg.add_office('district', 'District Address',
                               address=address, phone=phone)
            else:
                self.warning('invalid page, maybe a weird PDF?')
        except scrapelib.HTTPError:
            self.warning('could not fetch %s' % leg_url)

        leg.add_source(leg_url)
        self.save_legislator(leg)
def scrape(self, chamber, term):
    """Scrape a legislator table whose layout depends on a version number.

    ``self.urls[term]`` supplies the chamber URL and a ``version``;
    version 2+ tables hold party and district in separate cells, older
    ones hold "party-district" in a single cell.

    :raises NoDataForPeriod: when no URL is configured for the chamber.
    """
    url = self.urls[term][chamber]
    version = self.urls[term]['version']
    if url is None:
        raise NoDataForPeriod(term)

    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)

        for row in page.xpath("//tr")[1:]:
            cells = row.xpath("td")

            # "Last, First[, Suffix]" -> "First Last[, Suffix]"
            pieces = cells[0].text_content().split(",")
            if len(pieces) == 2:
                fullname = "%s %s" % (pieces[1].strip(), pieces[0].strip())
            elif len(pieces) == 3:
                fullname = "%s %s, %s" % (pieces[1].strip(),
                                          pieces[0].strip(),
                                          pieces[2].strip())
            else:
                fullname = ' '.join(pieces).strip()

            phone = email = ''
            # Most recent general assembly legislators list is slightly
            # different than archived versions
            if version >= 2:
                party = cells[1].text_content().strip()
                district = cells[3].text_content().replace(
                    "District ", "").strip()
                if version >= 3:
                    # NOTE(review): phone and email are both read from
                    # cell 6, and phone is never passed on -- confirm
                    # which column each one should come from.
                    phone = cells[6].text_content().strip()
                    email = cells[6].text_content().strip()
            else:
                party, district = cells[1].text_content().split("-")
                party = party.strip()
                district = district.strip()

            leg = Legislator(term, chamber, district, fullname,
                             party=party, email=email)
            leg.add_source(url)
            self.save_legislator(leg)
def scrape_upper(self, term):
    """Scrape upper-chamber members from the senate roster table.

    Each senator's own page is fetched for the photo.

    :raises ValueError: when neither '(D)' nor '(R)' appears in the
        person cell.
    """
    url = 'http://www.utahsenate.org/aspx/roster.aspx'
    doc = lxml.html.fromstring(self.urlopen(url))
    doc.make_links_absolute(url)

    for row in doc.xpath('//tr')[1:]:
        cells = row.xpath('td')

        # 1st has district
        district = cells[0].text_content()

        # 3rd has name and email
        person = cells[2].xpath('span[@class="person"]')[0]
        person_text = person.text_content()
        if '(D)' in person_text:
            party = 'Democratic'
        elif '(R)' in person_text:
            party = 'Republican'
        else:
            raise ValueError('unknown party')

        anchor = person.xpath('a')[0]
        name = anchor.text_content()
        leg_url = anchor.get('href')
        email = cells[2].xpath('span[@class="email"]/a/text()')[0]

        # text is split by br in 4th td, join with a space
        address = ' '.join(row.xpath('td[4]/font/text()'))

        # get photo
        leg_doc = lxml.html.fromstring(self.urlopen(leg_url))
        leg_doc.make_links_absolute(leg_url)
        photo_url = leg_doc.xpath('//p[@class="photo"]/img/@src')[0]

        leg = Legislator(term, 'upper', district, name,
                         party=party, email=email, address=address,
                         photo_url=photo_url, url=leg_url)
        leg.add_source(url)
        leg.add_source(leg_url)
        self.save_legislator(leg)
def scrape_2011Leg(self, chamber, term, url):
    """2011 scraper for legislators.

    Reads the ``GridView1`` table; every row that links to a member page
    becomes one saved legislator.
    """
    parties = {'(D)': 'Democratic', '(R)': 'Republican'}

    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)
        table = page.xpath('//table[contains(@id, "GridView1")]')[0]

        for row in table.xpath('tr[td/a[contains(@href, "memberpage")]]'):
            def first(path, row=row):
                return row.xpath(path)[0]

            district = first(
                'td/span[contains(@id, "LabelDistrict")]/font').text

            last_name_a = first('td/a[contains(@id, "HyperLinkLast")]')
            member_url = last_name_a.get('href')
            last_name = last_name_a.text_content().strip()

            # the first word is the first name, the rest the middle name
            first_names = first(
                'td/span[contains(@id, "LabelFirst")]/font').text.strip()
            name_words = first_names.split()
            first_name = name_words[0]
            middle_name = ' '.join(name_words[1:])

            party = parties[
                first('td/span[contains(@id, "LabelParty")]/font').text]

            params = {}
            params['office_address'] = (
                first('td/span[contains(@id, "LabelRoom")]').text +
                " " +
                first('td/span[contains(@id, "LabelRoom2")]').text)
            params['photo_url'] = first(
                'td/a[contains(@id, "HyperLinkChairJPG")]/img'
            ).attrib['src']
            params['email'] = first(
                'td/a[contains(@id, "HyperLinkEmail")]').text
            params['phone'] = first(
                'td/span[contains(@id, "LabelPhone2")]').text

            full_name = first_names + " " + last_name

            leg = Legislator(term, chamber, district, full_name,
                             first_name, last_name, middle_name, party,
                             url=member_url, **params)
            leg.add_source(url)
            self.save_legislator(leg)
def scrape(self, chamber, term):
    """Scrape the member list page for one chamber.

    Every member row after the first yields one legislator.  Office
    address and phone are attached only when the row's text contains
    them, so one member's contact details can never leak into the next.

    :raises ValueError: when a row carries neither a ``[D]`` nor an
        ``[R]`` party marker.
    """
    party_markers = (('[D]', 'Democratic'), ('[R]', 'Republican'))

    if chamber == 'lower':
        url = 'http://www.scstatehouse.gov/html-pages/housemembers.html'
    else:
        url = 'http://www.scstatehouse.gov/html-pages/senatemembersd.html'

    with self.urlopen(url) as data:
        doc = lxml.html.fromstring(data)
        rows = doc.xpath('//pre/div[@class="sansSerifNormal"]')

        for row in rows[1:]:
            member_a = row.xpath('a')[0]
            name_party = member_a.text_content()

            # Resolve party and name afresh for every row; an unknown
            # marker must not silently reuse the previous row's values.
            for marker, label in party_markers:
                if marker in name_party:
                    party = label
                    full_name = name_party.partition(marker)[0].strip()
                    break
            else:
                raise ValueError('unknown party in %r' % name_party)

            # the photo file is named after the number in the member link
            member_id = re.search(
                r'(\d+)\.html', member_a.attrib['href']).group(1)
            photo_url = ('http://www.scstatehouse.gov/members/gif/' +
                         member_id + '.jpg')

            other_data = row.text_content().encode('ascii', 'ignore')
            od_result = re.search(
                r'^.+District (\d+) - (.+)Count.+$', other_data)
            district = od_result.group(1)

            # Contact details are optional; only pass them when present.
            contact = {}
            contentb = re.search(
                r'^.+\(C\) (.+,.*\d+).*Bus. (\(\d+\) \d+-\d+).+$',
                other_data)
            if contentb is not None:
                contact['office_address'] = contentb.group(1)
                contact['office_phone'] = contentb.group(2)

            legislator = Legislator(term, chamber, district, full_name,
                                    party=party, photo_url=photo_url,
                                    **contact)
            legislator.add_source(url)
            self.save_legislator(legislator)
def get_member(self, term, chamber, kpid):
    """Fetch one member from the API, enrich it from the bio page, save it.

    When the bio page request fails with an HTTP error, a warning is
    logged and both ``url`` and ``photo_url`` are stored as empty strings.
    Only the terms listed in the slug table below are supported.
    """
    term_slugs = {'2013-2014': 'b2013_14', '2015-2016': 'b2015_16'}

    url = '%smembers/%s' % (ksapi.url, kpid)
    content = json.loads(self.get(url).text)['content']

    party = content['PARTY']
    if party == 'Democrat':
        party = 'Democratic'

    leg_url = 'http://www.kslegislature.org/li/%s/members/%s/' % (
        term_slugs[term], kpid)

    try:
        legislator_page = self.lxmlize(leg_url)
        # exactly one profile picture is expected on the page
        (photo_url, ) = legislator_page.xpath(
            '//img[@class="profile-picture"]/@src')
    except scrapelib.HTTPError:
        self.warning("{}'s legislator bio page not found".format(
            content['FULLNAME']))
        leg_url = ''
        photo_url = ''

    legislator = Legislator(term, chamber,
                            str(content['DISTRICT']),
                            content['FULLNAME'],
                            party=party,
                            url=leg_url,
                            photo_url=photo_url,
                            occupation=content['OCCUPATION'])

    address = ('Room %s\n'
               'Kansas State Capitol Building\n'
               '300 SW 10th St.\n'
               'Topeka, KS 66612') % content['OFFICENUM']

    legislator.add_office('capitol', 'Capitol Office',
                          phone=content['OFFPH'] or None,
                          address=address,
                          email=content['EMAIL'])

    legislator.add_source(url)
    self.save_legislator(legislator)
def scrape_lower(self, term):
    """Scrape lower-chamber members from the house members page.

    Every 'District' link after the first three is treated as a member;
    district and party come from the third and fourth cells of its row.
    """
    party_names = {'R': 'Republican', 'D': 'Democratic'}

    url = "http://www.okhouse.gov/Members/Default.aspx"
    page = lxml.html.fromstring(self.urlopen(url))

    for link in page.xpath("//a[contains(@href, 'District')]")[3:]:
        name = link.text.strip()
        district = link.xpath("string(../../td[3])").strip()

        party = link.xpath("string(../../td[4])").strip()
        # anything other than R/D is passed through unchanged
        party = party_names.get(party, party)

        leg = Legislator(term, 'lower', district, name, party=party)
        leg.add_source(url)
        self.save_legislator(leg)
def _scrape_upper(self, roster_page, roster_url, term):
    """Save one upper-chamber legislator per ``memdir`` table.

    Every ``td`` carrying a ``headers`` attribute becomes an office: a
    capitol office when the attribute contains 'CAP', a district office
    otherwise.
    """
    # TODO: photo_urls http://www.senate.texas.gov/members.php
    for table in roster_page.xpath('//table[@class="memdir"]'):
        anchor = table.xpath('.//a')[0]
        name = anchor.text
        leg_url = anchor.get('href')

        district_label = table.xpath(
            './/span[contains(text(), "District:")]')[0]
        district = district_label.tail.lstrip('0')
        party = table.xpath('.//span[contains(text(), "Party:")]')[0].tail

        legislator = Legislator(term, 'upper', district, name,
                                party=party, url=leg_url)

        for cell in table.xpath('.//td[@headers]'):
            fax = None
            phone = None
            lines = [cell.text]

            for child in cell.getchildren():
                label = child.text
                if child.tag == 'span' and label:
                    # when we get to a span tag we just ingested a phone #:
                    # the most recent line is the number the span labels
                    if 'TEL' in label:
                        phone = lines.pop()
                    elif 'FAX' in label:
                        fax = lines.pop()
                elif child.tail:
                    lines.append(child.tail)

            address = '\n'.join(line.strip() for line in lines if line)

            if 'CAP' in cell.get('headers'):
                office_type, office_name = 'capitol', 'Capitol Office'
            else:
                office_type, office_name = 'district', 'District Office'

            legislator.add_office(office_type, office_name,
                                  address=address, phone=phone, fax=fax)

        legislator.add_source(roster_url)
        legislator.add_source(leg_url)
        self.save_legislator(legislator)
def scrape_senators(self, term):
    """Scrape every senator linked from the senate index page.

    Vacant seats are skipped.  The photo URL is derived from the district
    number, and offices are filled in by ``scrape_sen_offices``.
    """
    url = "http://www.flsenate.gov/Senators/"
    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        for link in page.xpath("//a[contains(@href, 'Senators/s')]"):
            name = re.sub(r'\s+', ' ', link.text.strip())
            leg_url = link.get('href')

            # Any name mentioning 'Vacant' is skipped here, so no separate
            # handling of a bare 'Vacant' name is needed further down.
            if 'Vacant' in name:
                continue

            # Special case - name_tools gets confused
            # by 'JD', thinking it is a suffix instead of a first name
            if name == 'Alexander, JD':
                name = 'JD Alexander'

            district = link.xpath("string(../../td[1])")
            party = link.xpath("string(../../td[2])")

            # for consistency
            if party == 'Democrat':
                party = 'Democratic'

            photo_url = ("http://www.flsenate.gov/userContent/"
                         "Senators/2010-2012/photos/s%03d.jpg" % (
                             int(district)))

            leg = Legislator(term, 'upper', district, name,
                             party=party, photo_url=photo_url, url=leg_url)
            leg.add_source(url)
            leg.add_source(leg_url)

            self.scrape_sen_offices(leg, leg_url)
            self.save_legislator(leg)
def scrape(self, term, chambers):
    """Scrape the biography page of each of the 49 districts.

    ``chambers`` is not used; every legislator is saved as 'upper' with
    party 'Nonpartisan'.  Districts whose page cannot be retrieved are
    logged and skipped, as are vacant seats.
    """
    base_url = 'http://news.legislature.ne.gov/dist'
    address_items = ('//div[@id="sidebar"]/ul[1]/li[3]',
                     '//div[@id="sidebar"]/ul[1]/li[4]',
                     '//div[@id="sidebar"]/ul[1]/li[5]')

    # there are 49 districts
    for district in range(1, 50):
        # district numbers are zero-padded to two digits in the URL
        rep_url = '%s%02d/biography/' % (base_url, district)

        try:
            page = lxml.html.fromstring(self.get(rep_url).text)

            header = page.xpath(
                '//div[@class="content_header_right"]/a')[0].text
            full_name = header.split(' ', 1)[1].strip()
            if full_name == 'Seat Vacant':
                continue

            # This is hacky, are lis always the same?
            address = '\n'.join(page.xpath(item)[0].text.strip()
                                for item in address_items)

            phone_words = page.xpath(
                '//div[@id="sidebar"]/ul[1]/li[6]')[0].text.split()
            if len(phone_words) > 2:
                phone = phone_words[1] + ' ' + phone_words[2]
            else:
                phone = None

            mailto = page.xpath(
                '//div[@id="sidebar"]/ul[1]/li[contains(text(), "Email:")]/a/@href')[0]
            # drop the leading "mailto:"
            email = mailto[7:]

            photo_url = (
                "http://www.nebraskalegislature.gov/media/images/blogs/dist%02d.jpg"
                % district)

            # Nebraska is officially nonpartisan
            party = 'Nonpartisan'

            leg = Legislator(term, 'upper', str(district), full_name,
                             party=party, email=email, url=rep_url,
                             photo_url=photo_url)
            leg.add_source(rep_url)
            leg.add_office('capitol', 'Capitol Office',
                           address=address, phone=phone)
            self.save_legislator(leg)
        except scrapelib.HTTPError:
            self.warning('could not retrieve %s' % rep_url)
def scrape_senate(self, term):
    """Scrape senators from the at-large page and the eight district pages.

    The position of a page in ``urls`` decides the district: the first
    page is 'At-Large', the others are numbered 1-8.  A photo URL is
    stored only for rows that actually contain an image.
    """
    urls = (
        'http://www.senadopr.us/senadores/Pages/Senadores%20Acumulacion.aspx',
        'http://www.senadopr.us/Pages/Senadores%20Distrito%20I.aspx',
        'http://www.senadopr.us/Pages/Senadores%20Distrito%20II.aspx',
        'http://www.senadopr.us/Pages/Senadores%20Distrito%20III.aspx',
        'http://www.senadopr.us/Pages/Senadores%20Distrito%20IV.aspx',
        'http://www.senadopr.us/Pages/Senadores%20Distrito%20V.aspx',
        'http://www.senadopr.us/Pages/Senadores%20Distrito%20VI.aspx',
        'http://www.senadopr.us/Pages/Senadores%20Distrito%20VII.aspx',
        'http://www.senadopr.us/Pages/Senadores%20Distrito%20VIII.aspx')

    for counter, url in enumerate(urls):
        district = 'At-Large' if counter == 0 else str(counter)

        with self.urlopen(url) as leg_page_html:
            doc = lxml.html.fromstring(leg_page_html)
            table = doc.xpath(
                '//table[@summary="Listado de Senadores"]')[0]

            # skip first row
            for row in table.xpath('tr')[1:]:
                tds = row.xpath('td')

                # Built per row so a senator without an image never
                # inherits the photo of the previous row.
                extra = {}
                img = row.xpath('.//img/@src')
                if img:
                    extra['photo_url'] = img[0]

                name = tds[1].text_content().title()
                party = tds[2].text_content()
                phone = tds[3].text_content()
                email = tds[4].text_content()

                leg = Legislator(term, 'upper', district, name,
                                 party=party, phone=phone, email=email,
                                 **extra)
                leg.add_source(url)
                self.save_legislator(leg)
def scrape_2011Leg(self, chamber, term, url):
    """2011 scraper for legislators.

    Reads the ``GridView1`` table; every row that links to a member page
    becomes one saved legislator.  The district is built from two label
    cells, and an unrecognised party letter is stored as an empty string.
    """
    titles = {'lower': 'Representative', 'upper': 'Senator'}
    parties = {'D': 'Democrat', 'R': 'Republican'}

    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)
        table = page.xpath('//table[contains(@id, "GridView1")]')[0]

        for row in table.xpath('tr[td/a[contains(@href, "memberpage")]]'):
            def first(path, row=row):
                return row.xpath(path)[0]

            district = (
                first('td/span[contains(@id, "LabelDis")]/font').text +
                " " +
                first('td/span[contains(@id, "LabelDistrict2")]/font').text)
            # Replace any / in district name to allow json file to save.
            district = district.replace('/', '-')

            params = {'title': titles.get(chamber, '')}

            last_name = first(
                'td/a[contains(@id, "HyperLinkLast")]/font').text.strip()
            first_names = first(
                'td/span[contains(@id, "LabelFirst")]/font').text.strip()
            name_words = first_names.split()
            first_name = name_words[0]
            middle_name = ' '.join(name_words[1:])

            letter = first(
                'td/span[contains(@id, "LabelParty")]/font').text
            letter = letter.replace('(', '').replace(')', '')
            # Expand party from initial letter.
            party = parties.get(letter, '')

            params['office_address'] = (
                first('td/span[contains(@id, "LabelRoom")]').text +
                " " +
                first('td/span[contains(@id, "LabelRoom2")]').text)
            params['photo_url'] = first(
                'td/a[contains(@id, "HyperLinkChairJPG")]/img'
            ).attrib['src']
            params['email'] = first(
                'td/a[contains(@id, "HyperLinkEmail")]').text
            params['phone'] = first(
                'td/span[contains(@id, "LabelPhone2")]').text

            full_name = first_names + " " + last_name

            leg = Legislator(term, chamber, district, full_name,
                             first_name, last_name, middle_name, party,
                             **params)
            leg.add_source(url)
            self.save_legislator(leg)
def scrape_senate(self, term):
    """Scrape the Minnesota Senate member list and save each senator.

    Only table rows with exactly five cells whose second cell is a key of
    ``self._parties`` are treated as member rows. For each one the
    senator's own page is also fetched to look for a portrait image.

    Both the list page and the member page are recorded as sources,
    because data is taken from each of them.

    :param term: term identifier passed through to ``Legislator``.
    """
    BASE_URL = 'http://www.senate.leg.state.mn.us/'
    url = 'http://www.senate.leg.state.mn.us/members/member_list.php'

    with self.urlopen(url) as html:
        doc = lxml.html.fromstring(html)

        for row in doc.xpath('//tr'):
            tds = row.xpath('td')
            # Guard clause: skip header, spacer and layout rows.
            if len(tds) != 5 or tds[1].text_content() not in self._parties:
                continue

            district = tds[0].text_content().lstrip('0')
            party = tds[1].text_content()
            name_a = tds[2].xpath('a')[0]
            name = name_a.text.strip()
            leg_url = BASE_URL + name_a.get('href')
            # Office location and phone share one cell, separated by two
            # non-breaking spaces.
            addr, phone = tds[3].text_content().split(u'\xa0\xa0')
            email = tds[4].text_content()

            leg = Legislator(term, 'upper', district, name,
                             party=self._parties[party], url=leg_url)

            # Expand the short office location into a full mailing
            # address for the building it names.
            if 'State Office' in addr:
                addr = ('100 Rev. Dr. Martin Luther King Jr. Blvd.\n'
                        'Room {0}\n'
                        'St. Paul, MN 55155-1206').format(addr)
            elif 'Capitol' in addr:
                addr = ('75 Rev. Dr. Martin Luther King Jr. Blvd.\n'
                        'Room {0}\n'
                        'St. Paul, MN 55155-1606').format(addr)
            leg.add_office('capitol', 'Capitol Office', address=addr,
                           phone=phone)

            # Only store the cell's text when it looks like an address.
            if '@' in email:
                leg['email'] = email

            with self.urlopen(leg_url) as leg_html:
                leg_doc = lxml.html.fromstring(leg_html)
                img_src = leg_doc.xpath('//img[@height=164]/@src')
                if img_src:
                    leg['photo_url'] = BASE_URL + img_src[0]

            leg.add_source(url)
            # The member page was fetched above, so record it as well.
            leg.add_source(leg_url)

            self.save_legislator(leg)
def scrape_legislator(self, chamber, term, name, url):
    """Scrape one Alaska legislator's page and save the legislator.

    District, party and e-mail are pulled by regular expression from the
    text of the ``fullpage`` div. If the page has neither a
    ``District:`` label nor a ``mailto`` link, nothing is saved.

    :param chamber: chamber passed through to ``Legislator``.
    :param term: term identifier passed through to ``Legislator``.
    :param name: legislator name; runs of whitespace are collapsed.
    :param url: member page to fetch; also recorded as the source.
    """
    page = self.urlopen(url)
    page = lxml.html.fromstring(page)

    name = re.sub(r'\s+', ' ', name)

    info = page.xpath('string(//div[@id = "fullpage"])')

    district = re.search(r'District: ([\w\d]+)', info)
    if district is None:
        maddr = page.xpath(
            "//div[@id='fullpage']//a[contains(@href, 'mailto')]")
        if not maddr:
            # Needed for http://senate.legis.state.ak.us/senator.php?id=cog ..
            return
        # This hack needed for http://house.legis.state.ak.us/rep.php?id=dru
        # please remove as soon as this is alive.
        district = maddr[0].getnext().tail
    else:
        district = district.group(1)

    party = re.search(r'Party: (.+)', info).group(1).strip()
    # for consistency
    if party == 'Democrat':
        party = 'Democratic'

    # Try the legis.state.ak.us form first, then the akleg.gov form.
    # Previously a page matching neither raised AttributeError on
    # ``None.group`` and aborted the scrape; now the legislator is saved
    # without an e-mail address instead.
    optional = {}
    email = (re.search(r'Email: ([\w_]+@legis\.state\.ak\.us)', info) or
             re.search(r'Email: (.+@akleg\.gov)', info))
    if email is not None:
        optional['email'] = email.group(1)

    leg = Legislator(term, chamber, district, name, party=party,
                     url=url, **optional)

    self.scrape_address(leg, page, 'bioleft')
    self.scrape_address(leg, page, 'bioright')

    leg.add_source(url)

    self.save_legislator(leg)