def scrape_reps(self, year):
    """Scrape House members and their committee assignments.

    Reads the member listing page, builds a ``Legislator`` for every row
    that is not an empty seat, follows the member's meta-refresh redirect
    to the detail page, records each committee linked under the
    "Committee Assignments" heading, and saves the legislator.

    ``year`` is accepted but not consulted here; the session is
    hard-coded to ``'81'``.
    """
    rep_url = 'http://www.house.state.tx.us/members/welcome.php'
    members_base = 'http://www.house.state.tx.us/members/'

    with self.urlopen_context(rep_url) as page:
        root = lxml.etree.fromstring(page, lxml.etree.HTMLParser())

        # The first table row is skipped (presumably a header row).
        for el in root.xpath('//form[@name="frmMembers"]/table/tr')[1:]:
            full_name = el.xpath('string(td/a/font/span)')
            district = el.xpath('string(td[2]/span)')

            if full_name.startswith('District'):
                # Ignore empty seats
                continue

            # The name prefix is not used when building the Legislator.
            _prefix, first, last, suffixes = name_tools.split(full_name)
            # Party is not read from this listing, so it is left blank.
            party = ''

            leg = Legislator('81', 'lower', district,
                             full_name, first, last, '', party,
                             suffix=suffixes)
            leg.add_source(rep_url)

            # Is there anything out there that handles meta refresh?
            redirect_url = members_base + el.xpath('td/a')[0].attrib['href']

            with self.urlopen_context(redirect_url) as redirect_page:
                redirect = lxml.etree.fromstring(redirect_page,
                                                 lxml.etree.HTMLParser())

            try:
                refresh = redirect.xpath(
                    "//meta[@http-equiv='refresh']")[0].attrib['content']
                filename = refresh.split('0;URL=')[1]
            except (IndexError, KeyError, AttributeError):
                # IndexError: no refresh tag, or no '0;URL=' marker in it.
                # KeyError: the refresh tag has no 'content' attribute.
                # AttributeError: guards against the parser yielding no
                # root element for the page.
                #
                # The Speaker's member page does not redirect.
                # The Speaker is not on any committees
                # so we can just continue with the next member.
                self.save_legislator(leg)
                continue

            details_url = redirect_url.replace('welcome.htm', filename)

            with self.urlopen_context(details_url) as details_page:
                details = lxml.etree.fromstring(details_page,
                                                lxml.etree.HTMLParser())

                # Every link under the parent of the bold
                # "Committee Assignments" heading is treated as a committee.
                comms = details.xpath(
                    "//b[contains(text(), 'Committee Assignments')]/"
                    "..//a")
                for comm in comms:
                    leg.add_role('committee member', '81',
                                 committee=comm.text.strip())

            self.save_legislator(leg)
def scrape_senators(self, year):
    """Scrape Senate members and their committee memberships.

    For each "senator identification" table on the listing page, builds a
    ``Legislator`` from the linked name, district and party cells, then
    opens the senator's own page and adds a 'committee member' role for
    every link in the list that follows the "Committee Membership"
    heading before saving the legislator.

    ``year`` is accepted but not consulted here; the session is
    hard-coded to ``'81'``.
    """
    senator_url = 'http://www.senate.state.tx.us/75r/senate/senmem.htm'
    senate_base = 'http://www.senate.state.tx.us/75r/senate/'

    with self.urlopen_context(senator_url) as listing:
        listing_doc = lxml.etree.fromstring(listing, lxml.etree.HTMLParser())
        tables = listing_doc.xpath(
            '//table[@summary="senator identification"]')

        for table in tables:
            link = table.xpath('tr/td[@headers="senator"]/a')[0]
            name = link.text
            district = table.xpath('string(tr/td[@headers="district"])')
            party = table.xpath('string(tr/td[@headers="party"])')

            # The name prefix is not used when building the Legislator.
            _prefix, first, last, suffixes = name_tools.split(name)

            senator = Legislator('81', 'upper', district,
                                 name, first, last, '', party,
                                 suffix=suffixes)
            senator.add_source(senator_url)

            profile_url = senate_base + link.attrib['href']
            with self.urlopen_context(profile_url) as profile:
                profile_doc = lxml.etree.fromstring(profile,
                                                    lxml.etree.HTMLParser())

                heading = profile_doc.xpath(
                    "//h2[contains(text(), 'Committee Membership')]")[0]
                # The committee list is the element right after the heading.
                committee_list = heading.getnext()

                for anchor in committee_list.xpath('li/a'):
                    # Text trailing the link is part of the committee name.
                    if anchor.tail:
                        label = anchor.text + anchor.tail
                    else:
                        label = anchor.text
                    senator.add_role('committee member', '81',
                                     committee=label.strip())

            self.save_legislator(senator)
def scrape_legislators(self, chamber, year):
    """Save two hard-coded sample legislators for the given chamber.

    Raises ``NoDataForYear`` unless ``year`` is the string ``"2009"``.
    The first legislator gets the presiding-officer role matching the
    chamber ("upper" -> President of the Senate, anything else ->
    Speaker of the House); the second is the Minority Leader.
    """
    # Guard clause: only the 2009 data set exists.
    if year != "2009":
        raise NoDataForYear

    session = "2009-2010"
    if chamber == "upper":
        presiding_title = "President of the Senate"
    else:
        presiding_title = "Speaker of the House"

    bob = Legislator(session, chamber, "1st",
                     "Bob Smith", "Bob", "Smith", "", "Democrat")
    bob.add_role(presiding_title, session)
    bob.add_source("http://example.com/Bob_Smith.html")

    sally = Legislator(session, chamber, "2nd",
                       "Sally Johnson", "Sally", "Johnson", "", "Republican")
    sally.add_role("Minority Leader", session)
    sally.add_source("http://example.com/Sally_Johnson.html")

    # Save in the same order the records were built.
    for member in (bob, sally):
        self.save_legislator(member)