示例#1
0
    def _scrape_upper(self, roster_page, term):
        """Scrape every senator linked from the roster, then the Lt. Governor.

        Args:
            roster_page: parsed roster document; only its ``xpath`` method is
                used here.
            term: term identifier forwarded to each record that is created.
        """
        def district_number(link):
            # The sort key is the run of digits directly in front of ".htm".
            return int(re.search(r'\d+(?=\.htm)', link).group())

        senator_links = roster_page.xpath('(//table[caption])[1]//a/@href')
        # Walking the districts in numeric order makes omissions easy to spot.
        senator_links.sort(key=district_number)
        for senator_link in senator_links:
            self._scrape_senator(senator_link, term)

        # The Lt. Governor (President of the Senate) is handled separately,
        # from a dedicated page.
        lt_gov_url = 'http://www.senate.state.tx.us/75r/LtGov/Ltgov.htm'
        lt_gov_doc = lxml.html.fromstring(self.get(lt_gov_url).text)
        title_text = lt_gov_doc.xpath('//div[@class="memtitle"]/text()')[0]
        lt_gov_name = title_text.replace('Lieutenant Governor', '').strip()

        lt_governor = Person(lt_gov_name)
        # The party is hard-coded: treated as a safe assumption because
        # neither the official member page nor the party listings give it.
        lt_governor.add_role('Lt. Governor', term, party='Republican')
        lt_governor.add_source(lt_gov_url)
        self.save_legislator(lt_governor)
示例#2
0
    def _scrape_upper(self, roster_page, term):
        """Scrape all senators found on the roster page plus the Lt. Governor.

        Args:
            roster_page: parsed roster document queried through ``xpath``.
            term: term identifier attached to every scraped record.
        """
        # Digits that sit immediately before the ".htm" extension of a link.
        district_digits = re.compile(r'\d+(?=\.htm)')

        hrefs = roster_page.xpath('(//table[caption])[1]//a/@href')
        # Numeric district order, so that a missing member is easy to notice.
        hrefs.sort(key=lambda href: int(district_digits.search(href).group()))

        for href in hrefs:
            self._scrape_senator(href, term)

        # Lt. Governor (President of the Senate): scraped on its own because
        # it lives on a separate, fixed page.
        source_url = 'http://www.senate.state.tx.us/75r/LtGov/Ltgov.htm'
        response = self.get(source_url)
        document = lxml.html.fromstring(response.text)
        heading = document.xpath('//div[@class="memtitle"]/text()')[0]
        person_name = heading.replace('Lieutenant Governor', '')
        person_name = person_name.strip()

        # Assumed value: the official member page and the party listings
        # carry no party information.
        assumed_party = 'Republican'

        lt_governor = Person(person_name)
        lt_governor.add_role('Lt. Governor', term, party=assumed_party)
        lt_governor.add_source(source_url)
        self.save_legislator(lt_governor)
示例#3
0
    def scrape_member(self, chamber, term, member_url):
        """Scrape one member page and save the resulting record.

        A page whose type heading reads ``'Lt. Gov.'`` produces a ``Person``
        with a 'Lt. Governor' role (saved via ``save_person``); any other
        page produces a ``Legislator``, which is saved only when a district
        was found.

        Args:
            chamber: chamber passed to ``Legislator`` and used for committee
                roles (except the hard-coded joint committee below).
            term: term identifier attached to the record and its roles.
            member_url: URL of the member page to fetch.

        Raises:
            IndexError: if an expected element (subtitle div, photo, info
                cell, party text, committee div) is missing from the page.
        """
        with self.urlopen(member_url) as page:
            root = lxml.html.fromstring(page)
            root.make_links_absolute(member_url)

            sdiv = root.xpath('//div[@class="subtitle"]')[0]
            table = sdiv.getnext()

            photo_url = table.xpath('//img[@id="ctl00_ContentPlaceHolder1'
                                    '_imgMember"]')[0].attrib['src']

            td = table.xpath('//td[@valign="top"]')[0]

            # NOTE(review): the paths below start with "//", which XPath
            # evaluates from the document root rather than relative to
            # ``td`` -- confirm this is intended before tightening them.
            member_type = td.xpath('string(//div[1]/strong)').strip()

            full_name = td.xpath('string(//div[2]/strong)').strip()
            full_name = re.sub(r'\s+', ' ', full_name)

            district = td.xpath('string(//div[3])').strip()
            district = district.replace('District ', '')

            addrs = {}
            for atype, text in (('capital_address', 'Capitol address:'),
                                ('district_address', 'District address:')):
                aspan = root.xpath("//span[. = '%s']" % text)
                addrs[atype] = None
                if not aspan:
                    continue

                # The address is the text following the label span plus the
                # text following each directly consecutive <br>.
                address = aspan[0].tail
                elem = aspan[0].getnext()
                while elem is not None and elem.tag == 'br':
                    if elem.tail:
                        # lxml reports a missing tail as None; treat it as
                        # empty so the concatenation cannot raise TypeError.
                        address = (address or '') + "\n" + elem.tail
                    elem = elem.getnext()
                addrs[atype] = address

            # Only the first character of the party text is inspected.
            party = td.xpath('string(//div[4])').strip()[0]
            if party == 'D':
                party = 'Democratic'
            elif party == 'R':
                party = 'Republican'

            if member_type == 'Lt. Gov.':
                leg = Person(full_name)
                leg.add_role('Lt. Governor', term, party=party, **addrs)
            else:
                leg = Legislator(term,
                                 chamber,
                                 district,
                                 full_name,
                                 party=party,
                                 photo_url=photo_url,
                                 **addrs)

            leg.add_source(urlescape(member_url))

            comm_div = root.xpath('//div[string() = "Committee Membership:"]'
                                  '/following-sibling::div'
                                  '[@class="rcwcontent"]')[0]

            for link in comm_div.xpath('*/a'):
                name = link.text

                # Test '(Vice Chair)' first, then '(Chair)'.
                if '(Vice Chair)' in name:
                    mtype = 'vice chair'
                elif '(Chair)' in name:
                    mtype = 'chair'
                else:
                    mtype = 'member'

                name = clean_committee_name(link.text)

                # There's no easy way to determine whether a committee
                # is joint or not using the mobile legislator directory
                # (without grabbing a whole bunch of pages, at least)
                # so for now we will hard-code the one broken case
                if (name == "Oversight of HHS Eligibility System"
                        and term == '82'):
                    comm_chamber = 'joint'
                else:
                    comm_chamber = chamber

                if name.startswith('Appropriations-S/C on '):
                    # Appropriations subcommittees are recorded under the
                    # parent committee with the remainder as subcommittee.
                    sub = name.replace('Appropriations-S/C on ', '')
                    leg.add_role('committee member',
                                 term,
                                 chamber=comm_chamber,
                                 committee='Appropriations',
                                 subcommittee=sub,
                                 position=mtype)
                else:
                    leg.add_role('committee member',
                                 term,
                                 chamber=comm_chamber,
                                 committee=name,
                                 position=mtype)

            if member_type == 'Lt. Gov.':
                self.save_person(leg)
            elif district:
                # Pages without a district are deliberately not saved.
                self.save_legislator(leg)
示例#4
0
    def scrape_member(self, chamber, term, member_url):
        """Scrape one member page, including offices and committees.

        A page whose type heading reads ``'Lt. Gov.'`` produces a ``Person``
        with a 'Lt. Governor' role (saved via ``save_object``); any other
        page produces a ``Legislator``, saved only when a district was found.

        Args:
            chamber: chamber passed to ``Legislator`` and used for committee
                roles (except the hard-coded joint committee below).
            term: term identifier attached to the record and its roles.
            member_url: URL of the member page to fetch.

        Raises:
            IndexError: if an expected element (subtitle div, photo, info
                cell, party text, committee div) is missing from the page.
        """
        page = self.urlopen(member_url)
        root = lxml.html.fromstring(page)
        root.make_links_absolute(member_url)

        sdiv = root.xpath('//div[@class="subtitle"]')[0]
        table = sdiv.getnext()

        photo_url = table.xpath('//img[@id="ctl00_ContentPlaceHolder1'
                                '_imgMember"]')[0].attrib['src']

        td = table.xpath('//td[@valign="top"]')[0]

        # NOTE(review): the paths below start with "//", which XPath
        # evaluates from the document root rather than relative to ``td``
        # -- confirm this is intended before tightening them.
        member_type = td.xpath('string(//div[1]/strong)').strip()

        full_name = td.xpath('string(//div[2]/strong)').strip()
        full_name = re.sub(r'\s+', ' ', full_name)

        district = td.xpath('string(//div[3])').strip()
        district = district.replace('District ', '')

        # Only the first character of the party text is inspected.
        party = td.xpath('string(//div[4])').strip()[0]
        if party == 'D':
            party = 'Democratic'
        elif party == 'R':
            party = 'Republican'

        if member_type == 'Lt. Gov.':
            leg = Person(full_name)
            leg.add_role('Lt. Governor', term, party=party)
        else:
            leg = Legislator(term, chamber, district, full_name,
                             party=party, photo_url=photo_url,
                             url=member_url)

        leg.add_source(urlescape(member_url))

        # add addresses
        for atype, text in (('capitol', 'Capitol address'),
                            ('district', 'District address')):
            aspan = root.xpath("//span[. = '%s:']" % text)
            if not aspan:
                continue

            phone = None
            # lxml reports a missing tail as None (e.g. when the label is
            # followed directly by <br>); treat that as an empty first line
            # instead of raising AttributeError.
            addr = (aspan[0].tail or '').strip()

            # cycle through brs: each tail is either a phone number or
            # another address line
            elem = aspan[0].getnext()
            while elem is not None and elem.tag == 'br':
                if elem.tail:
                    if phone_re.match(elem.tail):
                        phone = elem.tail
                    else:
                        addr += "\n" + elem.tail
                elem = elem.getnext()

            # now add the addresses
            leg.add_office(atype, text, address=addr, phone=phone)

        # add committees
        comm_div = root.xpath('//div[string() = "Committee Membership:"]'
                              '/following-sibling::div'
                              '[@class="rcwcontent"]')[0]

        for link in comm_div.xpath('*/a'):
            name = link.text

            # Test '(Vice Chair)' first, then '(Chair)'.
            if '(Vice Chair)' in name:
                mtype = 'vice chair'
            elif '(Chair)' in name:
                mtype = 'chair'
            else:
                mtype = 'member'

            name = clean_committee_name(link.text)

            # There's no easy way to determine whether a committee
            # is joint or not using the mobile legislator directory
            # (without grabbing a whole bunch of pages, at least)
            # so for now we will hard-code the one broken case
            if (name == "Oversight of HHS Eligibility System"
                    and term == '82'):
                comm_chamber = 'joint'
            else:
                comm_chamber = chamber

            if name.startswith('Appropriations-S/C on '):
                # Appropriations subcommittees are recorded under the parent
                # committee with the remainder as the subcommittee name.
                sub = name.replace('Appropriations-S/C on ', '')
                leg.add_role('committee member', term,
                             chamber=comm_chamber,
                             committee='Appropriations',
                             subcommittee=sub,
                             position=mtype)
            else:
                leg.add_role('committee member', term,
                             chamber=comm_chamber,
                             committee=name,
                             position=mtype)

        if member_type == 'Lt. Gov.':
            self.save_object(leg)
        elif district:
            # Pages without a district are deliberately not saved.
            self.save_legislator(leg)
    def scrape(self, term, chambers):
        """Scrape the mayor and then every councilmember.

        The mayor is saved as a ``Person`` with a 'Mayor' role. Each
        councilmember is saved as an 'upper' chamber ``Legislator`` with up
        to two offices (City Hall and an optional second office) parsed from
        the member's contact page.

        Indentation in this method is spaces-only: the previous mix of tabs
        and spaces is a TabError on Python 3 and hid the real nesting.

        Args:
            term: term identifier attached to every record and role.
            chambers: not used by this method.

        Raises:
            AssertionError: if more than 17 councilmember URLs are found.
            IndexError / AttributeError: if an expected element or pattern
                is missing from one of the scraped pages.
        """
        # The mayor doesn't sit on council.
        url = 'http://www.phila.gov/'
        doc = lxml.html.fromstring(self.urlopen(url))
        doc.make_links_absolute(url)

        # The mayor's name doesn't appear on the mayor's page!
        page_title = doc.xpath('//title/text()')[0].strip()
        full_name = re.search('Mayor (.+)', page_title).group(1)
        first_name, middle_name, last_name = parse_full_name(full_name)
        mayor = Person(full_name, first_name, last_name, middle_name)
        mayor.add_source(url)

        url = 'http://www.phila.gov/mayor/'
        doc = lxml.html.fromstring(self.urlopen(url))
        doc.make_links_absolute(url)

        lines = map(clean_string, doc.xpath(
            '//div[contains(text(),"Mailing Address")]'
            '/following-sibling::text()')[1:])
        address = '\n'.join(lines)
        phone = '-'.join(tel_regex.search(doc.xpath(
            '//strong[contains(text(),"Phone")]'
            '/following-sibling::text()[1]')[0]).groups())
        fax = '-'.join(tel_regex.search(doc.xpath(
            '//strong[contains(text(),"Fax")]'
            '/following-sibling::text()[1]')[0]).groups())
        email = clean_string(doc.xpath(
            '//strong[contains(text(),"Email")]'
            '/following-sibling::text()[1]')[0])

        mayor.update(dict(url=url, email=email))
        mayor.add_office('capitol', 'Office of the Mayor',
                         address=address, phone=phone, fax=fax)
        mayor.add_role('Mayor', term)
        mayor.add_source(url)

        self.save_object(mayor)

        council_url = 'http://philadelphiacitycouncil.net/council-members/'
        doc = lxml.html.fromstring(self.urlopen(council_url))
        doc.make_links_absolute(council_url)

        urls = set(doc.xpath(
            '//a[contains(@href, "/council-members/council")]/@href'))
        assert len(urls) <= 17, \
            'expected 17 unique councilmember URLs, found %d' % len(urls)

        for url in urls:
            doc = lxml.html.fromstring(self.urlopen(url))
            doc.make_links_absolute(url)

            optional = dict()  # fields not all legislators will have
            full_name = []
            first_name = ''
            middle_name = ''
            last_name = ''
            suffixes = ''
            roles = []
            lines = []           # address lines of the City Hall office
            lines_office2 = []   # address lines of the second office
            has_office2 = False
            reached_contact_form = False
            phone1 = None
            phone1_office2 = None
            phone2 = None
            phone2_office2 = None
            fax = None
            fax_office2 = None
            office_name = None
            district = 'At-Large'  # default
            # First non-empty match wins; the first query is a special case
            # for Brian J. O'Neill's page.
            photo_url = (
                doc.xpath('//img[contains(@title, "brian picture")]/@src') or
                doc.xpath('//img[contains(@class, "size-full")]/@src') or
                doc.xpath('//img[contains(@class, "size-medium")]/@src') or
                doc.xpath('//img[contains(@class, "size-thumbnail")]/@src')
            )[0]

            # That's an en dash, not a hyphen.
            parts = re.split(u'[,–]', doc.xpath('//h3/text()')[0])
            for index, part in enumerate(filter(None, parts)):
                part = clean_string(part)
                if index == 0:
                    # First piece: title plus name.
                    if 'Councilman' in part:
                        optional['gender'] = 'Male'
                    elif 'Councilwoman' in part:
                        optional['gender'] = 'Female'
                    elif 'Council President' in part:
                        roles.append('Council President')
                    part = re.sub(
                        r'^Council(?:man|woman| President)\s+', '', part)
                    full_name.append(part)
                    first_name, middle_name, last_name = parse_full_name(
                        full_name[0])
                elif part in ('Jr.', 'Sr.'):
                    full_name.append(part)
                    suffixes = part
                elif 'District' in part:
                    district = part
                else:
                    roles.append(part)
            full_name = ', '.join(full_name)

            contact_url = doc.xpath('//a[text()="Contact"]/@href')[0]
            doc = lxml.html.fromstring(self.urlopen(contact_url))
            doc.make_links_absolute(contact_url)

            # @todo email, personal_url are sometimes in another paragraph.

            parts = doc.xpath('//div[@class="post-entry"]//text()')
            parts = map(clean_string, parts)
            # True while subsequent unrecognised lines belong to an address.
            consuming_address_lines = False
            for part in filter(None, parts):

                # Special case for Curtis Jones Jr.
                # This check is deliberately separate from the chain below:
                # the same line still goes through the chain afterwards.
                if re.match(r'^Local Office:', part):
                    consuming_address_lines = True
                    has_office2 = True
                    office_name = 'Local Office'

                if (re.match(r'City Hall Office', part) or
                        re.match(r'^Hours', part) or
                        re.match(r'.*facebook', part) or
                        re.match(r'.*twitter', part) or
                        reached_contact_form):
                    # Noise lines, and everything after the contact form.
                    continue

                elif (re.match(r'^Contact Council.*man', part) or
                        re.match(r'^Contact CMAL', part)):
                    reached_contact_form = True
                    continue

                elif re.match(r'^City Hall.+Room', part):
                    consuming_address_lines = True
                    lines.append(part)

                elif (re.match(r'^FAX:', part, re.I) or
                        re.match(r'^F:', part, re.I)):
                    consuming_address_lines = False
                    # Only the first fax number per office is kept.
                    if has_office2 and fax_office2 is None:
                        fax_office2 = '-'.join(tel_regex.search(part).groups())
                    elif fax is None:
                        fax = '-'.join(tel_regex.search(part).groups())

                elif tel_regex.search(part):
                    consuming_address_lines = False
                    # Only the first phone line per office is kept.
                    if (has_office2 and phone1_office2 is None and
                            phone2_office2 is None):
                        phone1_office2, phone2_office2 = parse_phones(part)
                    elif phone1 is None and phone2 is None:
                        phone1, phone2 = parse_phones(part)

                elif '@' in part:
                    consuming_address_lines = False
                    optional['email'] = re.search(r'\S+@\S+', part).group()

                elif re.match(r'^Neighborhood Office.*', part):
                    consuming_address_lines = False
                    has_office2 = True

                elif (re.match(r'.*Office.*', part) or
                        re.match(r'.*Heroes Hall.*', part)):

                    # Special case for Curtis Jones Jr.
                    if re.match(r'.*Local Office.*', part):
                        continue

                    if len(lines_office2) > 0:
                        consuming_address_lines = False
                    else:
                        consuming_address_lines = True
                        office_name = part.strip(':;,.')

                elif consuming_address_lines:
                    if has_office2:
                        lines_office2.append(cleanup_address(part, False))
                    else:
                        lines.append(cleanup_address(part))

                elif re.match(
                        r'^(?:, )?Philadelphia, PA(?: 19107(?:-3290)?)?$',
                        part):
                    pass

                else:
                    self.logger.warning('Skipped: ' + part)

            # Some Councilmembers have no zip code or only a 5-digit zip code.
            # All that changes between them is a room number.
            address = '\n'.join(lines)
            address_office2 = '\n'.join(lines_office2)

            legislator = Legislator(term, 'upper', district, full_name,
                                    first_name, last_name, middle_name,
                                    suffixes=suffixes, url=url,
                                    photo_url=photo_url, party=None)
            legislator.update(optional)

            # An office is added only when its address has some
            # non-whitespace content.
            if re.search(r'.*\S.*', address):
                legislator.add_office('capitol', 'City Hall Office',
                                      address=address, phone=phone1,
                                      secondary_phone=phone2, fax=fax)

            if re.search(r'.*\S.*', address_office2):
                legislator.add_office('district', office_name,
                                      address=address_office2,
                                      phone=phone1_office2,
                                      secondary_phone=phone2_office2,
                                      fax=fax_office2)

            legislator.add_source(url)

            for role in roles:
                legislator.add_role(role, term)

            self.save_legislator(legislator)
示例#6
0
    def scrape_member(self, chamber, term, member_url):
        """Scrape one member page and save the resulting record.

        A page whose type heading reads ``'Lt. Gov.'`` produces a ``Person``
        with a 'Lt. Governor' role (saved via ``save_person``); any other
        page produces a ``Legislator``, saved only when a district was found.

        Args:
            chamber: chamber passed to ``Legislator`` and to every committee
                role.
            term: term identifier attached to the record and its roles.
            member_url: URL of the member page to fetch; also recorded as
                the source.

        Raises:
            IndexError: if an expected element (subtitle div, photo, info
                cell, party text, committee div) is missing from the page.
        """
        with self.urlopen(member_url) as page:
            root = lxml.html.fromstring(page)
            root.make_links_absolute(member_url)

            sdiv = root.xpath('//div[@class="subtitle"]')[0]
            table = sdiv.getnext()

            photo_url = table.xpath('//img[@id="ctl00_ContentPlaceHolder1'
                                    '_imgMember"]')[0].attrib['src']

            td = table.xpath('//td[@valign="top"]')[0]

            # NOTE(review): the paths below start with "//", which XPath
            # evaluates from the document root rather than relative to
            # ``td`` -- confirm this is intended before tightening them.
            member_type = td.xpath('string(//div[1]/strong)').strip()

            full_name = td.xpath('string(//div[2]/strong)').strip()
            full_name = re.sub(r'\s+', ' ', full_name)

            district = td.xpath('string(//div[3])').strip()
            district = district.replace('District ', '')

            addrs = {}
            for atype, text in (('capital_address', 'Capitol address:'),
                                ('district_address', 'District address:')):
                addrs[atype] = None
                aspan = root.xpath("//span[. = '%s']" % text)
                if not aspan:
                    continue

                label = aspan[0]
                # Text right after the label, then the text following each
                # directly consecutive <br>.
                collected = label.tail
                elem = label.getnext()
                while elem is not None and elem.tag == 'br':
                    if elem.tail:
                        # lxml reports a missing tail as None; fall back to
                        # '' so this concatenation cannot raise TypeError.
                        collected = (collected or '') + "\n" + elem.tail
                    elem = elem.getnext()
                addrs[atype] = collected

            # Only the first character of the party text is inspected.
            party = td.xpath('string(//div[4])').strip()[0]
            if party == 'D':
                party = 'Democratic'
            elif party == 'R':
                party = 'Republican'

            if member_type == 'Lt. Gov.':
                leg = Person(full_name)
                leg.add_role('Lt. Governor', term, party=party, **addrs)
            else:
                leg = Legislator(term, chamber, district, full_name,
                                 party=party, photo_url=photo_url,
                                 **addrs)

            leg.add_source(member_url)

            comm_div = root.xpath('//div[string() = "Committee Membership:"]'
                                  '/following-sibling::div'
                                  '[@class="rcwcontent"]')[0]

            for link in comm_div.xpath('*/a'):
                name = link.text

                # Test '(Vice Chair)' first, then '(Chair)'.
                if '(Vice Chair)' in name:
                    mtype = 'vice chair'
                elif '(Chair)' in name:
                    mtype = 'chair'
                else:
                    mtype = 'member'

                name = clean_committee_name(link.text)

                if name.startswith('Appropriations-S/C on '):
                    # Appropriations subcommittees are recorded under the
                    # parent committee with the remainder as subcommittee.
                    sub = name.replace('Appropriations-S/C on ', '')
                    leg.add_role('committee member', term,
                                 chamber=chamber,
                                 committee='Appropriations',
                                 subcommittee=sub,
                                 position=mtype)
                else:
                    leg.add_role('committee member', term,
                                 chamber=chamber,
                                 committee=name,
                                 position=mtype)

            if member_type == 'Lt. Gov.':
                self.save_person(leg)
            elif district:
                # Pages without a district are deliberately not saved.
                self.save_legislator(leg)
示例#7
0
    def scrape_member(self, chamber, term, member_url):
        """Scrape one member page, including offices and committees.

        A page whose type heading reads ``'Lt. Gov.'`` produces a ``Person``
        with a 'Lt. Governor' role (saved via ``save_object``); any other
        page produces a ``Legislator``, saved only when a district was found.
        When no name can be extracted, a warning is emitted and nothing is
        saved.

        Args:
            chamber: chamber passed to ``Legislator`` and used for committee
                roles (except the hard-coded joint committee below).
            term: term identifier attached to the record and its roles.
            member_url: URL of the member page to fetch.

        Raises:
            IndexError: if an expected element (subtitle div, photo, info
                cell, party text, committee div) is missing from the page.
        """
        page = self.get(member_url).text
        root = lxml.html.fromstring(page)
        root.make_links_absolute(member_url)

        sdiv = root.xpath('//div[@class="subtitle"]')[0]
        table = sdiv.getnext()

        photo_url = table.xpath('//img[@id="ctl00_ContentPlaceHolder1'
                                '_imgMember"]')[0].attrib['src']

        td = table.xpath('//td[@valign="top"]')[0]

        # NOTE(review): the paths below start with "//", which XPath
        # evaluates from the document root rather than relative to ``td``
        # -- confirm this is intended before tightening them.
        member_type = td.xpath('string(//div[1]/strong)').strip()

        # Every <strong> text directly under a div is a candidate; the last
        # one (whitespace-normalised) is taken as the name.
        name_candidates = [re.sub(r'\s+', ' ', x).strip()
                           for x in td.xpath('//div/strong/text()')]
        if not name_candidates:
            self.warning("ERROR: CAN'T GET FULL NAME")
            return

        full_name = name_candidates[-1]

        district = td.xpath('string(//div[3])').strip()
        district = district.replace('District ', '')

        # Only the first character of the party text is inspected.
        party = td.xpath('string(//div[4])').strip()[0]
        if party == 'D':
            party = 'Democratic'
        elif party == 'R':
            party = 'Republican'

        if member_type == 'Lt. Gov.':
            leg = Person(full_name)
            leg.add_role('Lt. Governor', term, party=party)
        else:
            leg = Legislator(term,
                             chamber,
                             district,
                             full_name,
                             party=party,
                             photo_url=photo_url,
                             url=member_url)

        leg.add_source(urlescape(member_url))

        # add addresses
        for atype, text in (('capitol', 'Capitol address'),
                            ('district', 'District address')):
            aspan = root.xpath("//span[. = '%s:']" % text)
            if not aspan:
                continue

            phone = None
            # lxml reports a missing tail as None (e.g. when the label is
            # followed directly by <br>); treat that as an empty first line
            # instead of raising AttributeError.
            addr = (aspan[0].tail or '').strip()

            # cycle through brs: each tail is either a phone number or
            # another address line
            elem = aspan[0].getnext()
            while elem is not None and elem.tag == 'br':
                if elem.tail:
                    if phone_re.match(elem.tail):
                        phone = elem.tail
                    else:
                        addr += "\n" + elem.tail
                elem = elem.getnext()

            # now add the addresses
            leg.add_office(atype, text, address=addr, phone=phone)

        # add committees
        comm_div = root.xpath('//div[string() = "Committee Membership:"]'
                              '/following-sibling::div'
                              '[@class="rcwcontent"]')[0]

        for link in comm_div.xpath('*/a'):
            name = link.text

            # Test '(Vice Chair)' first, then '(Chair)'.
            if '(Vice Chair)' in name:
                mtype = 'vice chair'
            elif '(Chair)' in name:
                mtype = 'chair'
            else:
                mtype = 'member'

            name = clean_committee_name(link.text)

            # There's no easy way to determine whether a committee
            # is joint or not using the mobile legislator directory
            # (without grabbing a whole bunch of pages, at least)
            # so for now we will hard-code the one broken case
            if (name == "Oversight of HHS Eligibility System"
                    and term == '82'):
                comm_chamber = 'joint'
            else:
                comm_chamber = chamber

            if name.startswith('Appropriations-S/C on '):
                # Appropriations subcommittees are recorded under the parent
                # committee with the remainder as the subcommittee name.
                sub = name.replace('Appropriations-S/C on ', '')
                leg.add_role('committee member',
                             term,
                             chamber=comm_chamber,
                             committee='Appropriations',
                             subcommittee=sub,
                             position=mtype)
            else:
                leg.add_role('committee member',
                             term,
                             chamber=comm_chamber,
                             committee=name,
                             position=mtype)

        if member_type == 'Lt. Gov.':
            self.save_object(leg)
        elif district:
            # Pages without a district are deliberately not saved.
            self.save_legislator(leg)
示例#8
0
    def scrape_member(self, chamber, term, member_url):
        """Scrape one member page, including offices and committees.

        A page whose type heading reads ``"Lt. Gov."`` produces a ``Person``
        with a "Lt. Governor" role (saved via ``save_object``); any other
        page produces a ``Legislator``, saved only when a district was found.

        Args:
            chamber: chamber passed to ``Legislator`` and used for committee
                roles (except the hard-coded joint committee below).
            term: term identifier attached to the record and its roles.
            member_url: URL of the member page to fetch.

        Raises:
            IndexError: if an expected element (subtitle div, photo, info
                cell, party text, committee div) is missing from the page.
        """
        with self.urlopen(member_url) as page:
            root = lxml.html.fromstring(page)
            root.make_links_absolute(member_url)

            sdiv = root.xpath('//div[@class="subtitle"]')[0]
            table = sdiv.getnext()

            photo_url = table.xpath('//img[@id="ctl00_ContentPlaceHolder1_imgMember"]')[0].attrib["src"]

            td = table.xpath('//td[@valign="top"]')[0]

            # NOTE(review): the paths below start with "//", which XPath
            # evaluates from the document root rather than relative to
            # ``td`` -- confirm this is intended before tightening them.
            member_type = td.xpath("string(//div[1]/strong)").strip()

            full_name = td.xpath("string(//div[2]/strong)").strip()
            full_name = re.sub(r"\s+", " ", full_name)

            district = td.xpath("string(//div[3])").strip()
            district = district.replace("District ", "")

            # Only the first character of the party text is inspected.
            party = td.xpath("string(//div[4])").strip()[0]
            if party == "D":
                party = "Democratic"
            elif party == "R":
                party = "Republican"

            if member_type == "Lt. Gov.":
                leg = Person(full_name)
                leg.add_role("Lt. Governor", term, party=party)
            else:
                leg = Legislator(term, chamber, district, full_name, party=party, photo_url=photo_url, url=member_url)

            leg.add_source(urlescape(member_url))

            # add addresses
            for atype, text in (("capitol", "Capitol address"), ("district", "District address")):
                aspan = root.xpath("//span[. = '%s:']" % text)
                if not aspan:
                    continue

                phone = None
                # lxml reports a missing tail as None (e.g. when the label
                # is followed directly by <br>); treat that as an empty
                # first line instead of raising AttributeError.
                addr = (aspan[0].tail or "").strip()

                # cycle through brs: each tail is either a phone number or
                # another address line
                elem = aspan[0].getnext()
                while elem is not None and elem.tag == "br":
                    if elem.tail:
                        if phone_re.match(elem.tail):
                            phone = elem.tail
                        else:
                            addr += "\n" + elem.tail
                    elem = elem.getnext()

                # now add the addresses
                leg.add_office(atype, text, address=addr, phone=phone)

            # add committees
            comm_div = root.xpath(
                '//div[string() = "Committee Membership:"]' "/following-sibling::div" '[@class="rcwcontent"]'
            )[0]

            for link in comm_div.xpath("*/a"):
                name = link.text

                # Test "(Vice Chair)" first, then "(Chair)".
                if "(Vice Chair)" in name:
                    mtype = "vice chair"
                elif "(Chair)" in name:
                    mtype = "chair"
                else:
                    mtype = "member"

                name = clean_committee_name(link.text)

                # There's no easy way to determine whether a committee
                # is joint or not using the mobile legislator directory
                # (without grabbing a whole bunch of pages, at least)
                # so for now we will hard-code the one broken case
                if name == "Oversight of HHS Eligibility System" and term == "82":
                    comm_chamber = "joint"
                else:
                    comm_chamber = chamber

                if name.startswith("Appropriations-S/C on "):
                    # Appropriations subcommittees are recorded under the
                    # parent committee with the remainder as subcommittee.
                    sub = name.replace("Appropriations-S/C on ", "")
                    leg.add_role(
                        "committee member",
                        term,
                        chamber=comm_chamber,
                        committee="Appropriations",
                        subcommittee=sub,
                        position=mtype,
                    )
                else:
                    leg.add_role("committee member", term, chamber=comm_chamber, committee=name, position=mtype)

            if member_type == "Lt. Gov.":
                self.save_object(leg)
            elif district:
                # Pages without a district are deliberately not saved.
                self.save_legislator(leg)
    def scrape(self, term, chambers):
        """Scrape the mayor and then every councilmember.

        The mayor is saved as a ``Person`` with a 'Mayor' role. Each
        councilmember is saved as an 'upper' chamber ``Legislator`` with a
        single 'Council Office' parsed from the member's contact page.

        Args:
            term: term identifier attached to every record and role.
            chambers: not used by this method.

        Raises:
            AssertionError: if more than 17 councilmember URLs are found.
            IndexError / AttributeError: if an expected element or pattern
                is missing from one of the scraped pages.
        """
        # The mayor doesn't sit on council.
        url = 'http://www.phila.gov/'
        doc = lxml.html.fromstring(self.urlopen(url))
        doc.make_links_absolute(url)

        # The mayor's name doesn't appear on the mayor's page!
        page_title = doc.xpath('//title/text()')[0].strip()
        name = re.search('Mayor (.+)', page_title).group(1)
        mayor = Person(name)
        mayor.add_source(url)

        url = 'http://www.phila.gov/mayor/'
        doc = lxml.html.fromstring(self.urlopen(url))
        doc.make_links_absolute(url)

        lines = [clean_string(text) for text in doc.xpath(
            '//div[contains(text(),"Mailing Address")]'
            '/following-sibling::text()')[1:]]
        address = '\n'.join(lines)
        phone = '-'.join(tel_regex.search(doc.xpath(
            '//strong[contains(text(),"Phone")]'
            '/following-sibling::text()[1]')[0]).groups())
        fax = '-'.join(tel_regex.search(doc.xpath(
            '//strong[contains(text(),"Fax")]'
            '/following-sibling::text()[1]')[0]).groups())
        email = clean_string(doc.xpath(
            '//strong[contains(text(),"Email")]'
            '/following-sibling::text()[1]')[0])

        mayor.update(dict(url=url, email=email))
        mayor.add_office('capitol', 'Office of the Mayor',
                         address=address, phone=phone, fax=fax)
        mayor.add_role('Mayor', term)
        mayor.add_source(url)

        self.save_object(mayor)

        council_url = 'http://philadelphiacitycouncil.net/council-members/'
        doc = lxml.html.fromstring(self.urlopen(council_url))
        doc.make_links_absolute(council_url)

        urls = set(doc.xpath(
            '//a[contains(@href, "/council-members/council")]/@href'))
        assert len(urls) <= 17, \
            'expected 17 unique councilmember URLs, found %d' % len(urls)

        for url in urls:
            doc = lxml.html.fromstring(self.urlopen(url))
            doc.make_links_absolute(url)

            optional = dict()  # fields not all legislators will have
            name = []
            roles = []
            lines = []
            phone1 = None
            phone2 = None
            fax = None
            district = 'At-Large'  # default
            # First non-empty match wins, from the largest size class down.
            photo_url = (
                doc.xpath('//img[contains(@class, "size-full")]/@src') or
                doc.xpath('//img[contains(@class, "size-medium")]/@src') or
                doc.xpath('//img[contains(@class, "size-thumbnail")]/@src')
            )[0]

            # That's an en dash, not a hyphen.
            parts = re.split(u'[,–]', doc.xpath('//h3/text()')[0])
            for index, part in enumerate(filter(None, parts)):
                part = clean_string(part)
                if index == 0:
                    # First piece: title plus name.
                    if 'Councilman' in part:
                        optional['gender'] = 'Male'
                    elif 'Councilwoman' in part:
                        optional['gender'] = 'Female'
                    elif 'Council President' in part:
                        roles.append('Council President')
                    part = re.sub(
                        r'^Council(?:man|woman| President)\s+', '', part)
                    name.append(part)
                elif part in ('Jr.', 'Sr.'):
                    name.append(part)
                elif 'District' in part:
                    district = part
                else:
                    roles.append(part)
            name = ', '.join(name)

            contact_url = doc.xpath('//a[text()="Contact"]/@href')[0]
            doc = lxml.html.fromstring(self.urlopen(contact_url))
            doc.make_links_absolute(contact_url)

            # @todo email, second office, personal_url are sometimes in another paragraph.
            if len(doc.xpath('//div[@class="post-entry"]/p')) > 1:
                skipped = doc.xpath(
                    '//div[@class="post-entry"]/p[position()>1]')
                self.logger.warning(
                    'Skipped paragraphs:\n' +
                    '\n'.join(lxml.html.tostring(html) for html in skipped))

            parts = (
                doc.xpath(
                    '//div[@class="post-entry"]/p[position()=1]//text()') or
                doc.xpath('//div[@class="post-entry"]//text()')
            )
            # Materialised as a list because it is indexed below; a bare
            # map() result is not subscriptable on Python 3.
            parts = [clean_string(text) for text in parts]
            for part in filter(None, parts):
                if re.match(r'^City Hall', part):
                    room = re.search(r'Room (\d+)', part).group(1)
                    lines.append('City Hall, Room %s' % room)
                elif re.match(r'^FAX:', part, re.I):
                    fax = '-'.join(tel_regex.search(part).groups())
                elif tel_regex.search(part):
                    if phone1:
                        self.logger.warning(
                            'Already have phone numbers for one office: ' +
                            part)
                    else:
                        phones = tel_regex.findall(part)
                        phone1 = '-'.join(phones[0])
                        if len(phones) == 2:
                            phone2 = '-'.join(phones[1])
                        else:
                            # The second number is given as a trailing
                            # 4-digit alternative appended to the first 8
                            # characters of phone1.
                            # NOTE(review): this reads parts[2] rather than
                            # the current ``part`` -- confirm the phone line
                            # is always the third text node.
                            phone2 = phone1[:8] + re.search(
                                r'(?: or |/)(\d{4})$', parts[2]).group(1)
                elif '@' in part:
                    optional['email'] = re.search(r'\S+@\S+', part).group()
                elif re.match(
                        r'^(?:, )?Philadelphia, PA(?: 19107(?:-3290)?)?$',
                        part):
                    pass
                else:  # @todo second office is sometimes in the same paragraph.
                    self.logger.warning('Skipped: ' + part)

            # Some Councilmembers have no zip code or only a 5-digit zip code.
            # All that changes between them is a room number.
            lines.append('Philadelphia, PA 19107-3290')
            address = '\n'.join(lines)

            legislator = Legislator(term, 'upper', district, name,
                                    url=url, photo_url=photo_url, party=None)
            legislator.update(optional)
            legislator.add_office('capitol', 'Council Office',
                                  address=address, phone=phone1,
                                  secondary_phone=phone2, fax=fax)
            legislator.add_source(url)

            for role in roles:
                legislator.add_role(role, term)

            self.save_legislator(legislator)