Python Person примеры использования

Язык программирования: Python

Пространство имен/Пакет: billy.scrape.legislators

Класс/Тип: Person

Примеров на hotexamples.com: 9

Python Person - 9 примеров найдено. Это лучшие примеры Python кода для billy.scrape.legislators.Person, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

Person(3)

add_role(3)

add_source(3)

update(2)

add_office(1)

Пример #1

Показать файл

Файл: legislators.py Проект: QuorumUS/openstates

    def _scrape_upper(self, roster_page, term):
        """Scrape every senator linked from the roster, then the Lt. Governor.

        :param roster_page: parsed roster document; member links are taken
            from the first table that has a caption.
        :param term: term forwarded to ``_scrape_senator`` and recorded on
            the Lt. Governor's role.
        """
        def district_of(href):
            # The district number is the run of digits just before ".htm".
            return int(re.search(r'\d+(?=\.htm)', href).group())

        senator_links = roster_page.xpath('(//table[caption])[1]//a/@href')
        # Visiting members in district order makes omissions easier to spot.
        for senator_link in sorted(senator_links, key=district_of):
            self._scrape_senator(senator_link, term)

        # The Lt. Governor (President of the Senate) has a dedicated page.
        url = 'http://www.senate.state.tx.us/75r/LtGov/Ltgov.htm'
        page = lxml.html.fromstring(self.get(url).text)
        title_text = page.xpath('//div[@class="memtitle"]/text()')[0]
        name = title_text.replace('Lieutenant Governor', '').strip()

        # Neither the official member page nor the party listings give a
        # party, so one is assumed here.
        lt_governor = Person(name)
        lt_governor.add_role('Lt. Governor', term, party='Republican')
        lt_governor.add_source(url)
        self.save_legislator(lt_governor)

Пример #2

Показать файл

    def _scrape_upper(self, roster_page, term):
        """Scrape the senators on the roster and then the Lt. Governor.

        :param roster_page: parsed roster document whose first captioned
            table holds the member links.
        :param term: term passed to ``_scrape_senator`` and stored on the
            Lt. Governor's role.
        """
        district_digits = re.compile(r'\d+(?=\.htm)')

        member_urls = roster_page.xpath('(//table[caption])[1]//a/@href')
        # Ordering by district number makes a missing member stand out.
        member_urls.sort(
            key=lambda href: int(district_digits.search(href).group()))
        for href in member_urls:
            self._scrape_senator(href, term)

        # The Lt. Governor (President of the Senate) is handled separately.
        url = 'http://www.senate.state.tx.us/75r/LtGov/Ltgov.htm'
        response = self.get(url)
        page = lxml.html.fromstring(response.text)
        heading = page.xpath('//div[@class="memtitle"]/text()')[0]
        name = heading.replace('Lieutenant Governor', '')
        name = name.strip()

        # The party is assumed: it is not stated on the official member page
        # or in the party listings.
        party = 'Republican'

        lt_governor = Person(name)
        lt_governor.add_role('Lt. Governor', term, party=party)
        lt_governor.add_source(url)
        self.save_legislator(lt_governor)

Пример #3

Показать файл

    def scrape_member(self, chamber, term, member_url):
        """Scrape one member page and save the resulting record.

        A page whose first heading reads ``Lt. Gov.`` produces a ``Person``
        with an ``Lt. Governor`` role, saved with ``save_person``.  Any other
        page produces a ``Legislator``, saved with ``save_legislator`` only
        when a district was found.  Committee memberships listed on the page
        are added as roles in both cases.

        :param chamber: chamber passed to ``Legislator`` and used as the
            default chamber for committee roles.
        :param term: term passed to ``Legislator`` and to every role.
        :param member_url: address of the member page.
        :raises IndexError: when an expected page element is missing or the
            party text is empty (the lookups below index with ``[0]``).
        """
        with self.urlopen(member_url) as page:
            root = lxml.html.fromstring(page)
            root.make_links_absolute(member_url)

            sdiv = root.xpath('//div[@class="subtitle"]')[0]
            table = sdiv.getnext()

            photo_url = table.xpath('//img[@id="ctl00_ContentPlaceHolder1'
                                    '_imgMember"]')[0].attrib['src']

            td = table.xpath('//td[@valign="top"]')[0]

            # Named ``member_type`` so the ``type`` builtin is not shadowed.
            member_type = td.xpath('string(//div[1]/strong)').strip()

            full_name = td.xpath('string(//div[2]/strong)').strip()
            full_name = re.sub(r'\s+', ' ', full_name)

            district = td.xpath('string(//div[3])').strip()
            district = district.replace('District ', '')

            addrs = {}
            for atype, text in (('capital_address', 'Capitol address:'),
                                ('district_address', 'District address:')):
                aspan = root.xpath("//span[. = '%s']" % text)
                addrs[atype] = None

                if aspan:
                    # The label's tail is the first address line; lxml gives
                    # None when there is no text there, so never "+=" onto it
                    # blindly.
                    address = aspan[0].tail
                    elem = aspan[0].getnext()
                    # Each following <br> carries one more line in its tail.
                    while elem is not None and elem.tag == 'br':
                        if elem.tail:
                            if address is None:
                                address = elem.tail
                            else:
                                address += "\n" + elem.tail
                        elem = elem.getnext()
                    addrs[atype] = address

            # Only the first character of the party text is examined.
            party = td.xpath('string(//div[4])').strip()[0]
            if party == 'D':
                party = 'Democratic'
            elif party == 'R':
                party = 'Republican'

            if member_type == 'Lt. Gov.':
                leg = Person(full_name)
                leg.add_role('Lt. Governor', term, party=party, **addrs)
            else:
                leg = Legislator(term,
                                 chamber,
                                 district,
                                 full_name,
                                 party=party,
                                 photo_url=photo_url,
                                 **addrs)

            leg.add_source(urlescape(member_url))

            comm_div = root.xpath('//div[string() = "Committee Membership:"]'
                                  '/following-sibling::div'
                                  '[@class="rcwcontent"]')[0]

            for link in comm_div.xpath('*/a'):
                name = link.text

                # Test "(Vice Chair)" first; "(Chair)" is checked only when
                # that does not match.
                if '(Vice Chair)' in name:
                    mtype = 'vice chair'
                elif '(Chair)' in name:
                    mtype = 'chair'
                else:
                    mtype = 'member'

                name = clean_committee_name(link.text)

                # There's no easy way to determine whether a committee
                # is joint or not using the mobile legislator directory
                # (without grabbing a whole bunch of pages, at least)
                # so for now we will hard-code the one broken case
                if (name == "Oversight of HHS Eligibility System"
                        and term == '82'):
                    comm_chamber = 'joint'
                else:
                    comm_chamber = chamber

                if name.startswith('Appropriations-S/C on '):
                    # Recorded as a subcommittee of Appropriations.
                    sub = name.replace('Appropriations-S/C on ', '')
                    leg.add_role('committee member',
                                 term,
                                 chamber=comm_chamber,
                                 committee='Appropriations',
                                 subcommittee=sub,
                                 position=mtype)
                else:
                    leg.add_role('committee member',
                                 term,
                                 chamber=comm_chamber,
                                 committee=name,
                                 position=mtype)

            if member_type == 'Lt. Gov.':
                self.save_person(leg)
            elif district:
                # Legislators without a district are not saved.
                self.save_legislator(leg)

Пример #4

Показать файл

Файл: legislators.py Проект: JoeGermuska/openstates

    def scrape_member(self, chamber, term, member_url):
        """Scrape one member page and save the resulting record.

        A page whose first heading reads ``Lt. Gov.`` produces a ``Person``
        with an ``Lt. Governor`` role, saved with ``save_object``.  Any other
        page produces a ``Legislator``, saved with ``save_legislator`` only
        when a district was found.  Offices and committee memberships found
        on the page are attached in both cases.

        :param chamber: chamber passed to ``Legislator`` and used as the
            default chamber for committee roles.
        :param term: term passed to ``Legislator`` and to every role.
        :param member_url: address of the member page.
        :raises IndexError: when an expected page element is missing or the
            party text is empty (the lookups below index with ``[0]``).
        """
        page = self.urlopen(member_url)
        root = lxml.html.fromstring(page)
        root.make_links_absolute(member_url)

        sdiv = root.xpath('//div[@class="subtitle"]')[0]
        table = sdiv.getnext()

        photo_url = table.xpath('//img[@id="ctl00_ContentPlaceHolder1'
                                '_imgMember"]')[0].attrib['src']

        td = table.xpath('//td[@valign="top"]')[0]

        # Named ``member_type`` so the ``type`` builtin is not shadowed.
        member_type = td.xpath('string(//div[1]/strong)').strip()

        full_name = td.xpath('string(//div[2]/strong)').strip()
        full_name = re.sub(r'\s+', ' ', full_name)

        district = td.xpath('string(//div[3])').strip()
        district = district.replace('District ', '')

        # Only the first character of the party text is examined.
        party = td.xpath('string(//div[4])').strip()[0]
        if party == 'D':
            party = 'Democratic'
        elif party == 'R':
            party = 'Republican'

        if member_type == 'Lt. Gov.':
            leg = Person(full_name)
            leg.add_role('Lt. Governor', term, party=party)
        else:
            leg = Legislator(term, chamber, district, full_name,
                             party=party, photo_url=photo_url,
                             url=member_url)

        leg.add_source(urlescape(member_url))

        # add addresses
        for atype, text in (('capitol', 'Capitol address'),
                            ('district', 'District address')):
            aspan = root.xpath("//span[. = '%s:']" % text)
            addr = ''
            phone = None
            if aspan:
                # The label's tail is the first address line.  lxml gives
                # None when there is no text there, so fall back to ''
                # instead of calling .strip() on None.
                addr = (aspan[0].tail or '').strip()
                elem = aspan[0].getnext()
                # cycle through brs: each tail is either another address
                # line or, when phone_re matches, the phone number
                while elem is not None and elem.tag == 'br':
                    if elem.tail:
                        if not phone_re.match(elem.tail):
                            addr += "\n" + elem.tail
                        else:
                            phone = elem.tail
                    elem = elem.getnext()
                # now add the addresses
                leg.add_office(atype, text, address=addr, phone=phone)

        # add committees
        comm_div = root.xpath('//div[string() = "Committee Membership:"]'
                              '/following-sibling::div'
                              '[@class="rcwcontent"]')[0]

        for link in comm_div.xpath('*/a'):
            name = link.text

            # Test "(Vice Chair)" first; "(Chair)" is checked only when that
            # does not match.
            if '(Vice Chair)' in name:
                mtype = 'vice chair'
            elif '(Chair)' in name:
                mtype = 'chair'
            else:
                mtype = 'member'

            name = clean_committee_name(link.text)

            # There's no easy way to determine whether a committee
            # is joint or not using the mobile legislator directory
            # (without grabbing a whole bunch of pages, at least)
            # so for now we will hard-code the one broken case
            if (name == "Oversight of HHS Eligibility System" and
                    term == '82'):
                comm_chamber = 'joint'
            else:
                comm_chamber = chamber

            if name.startswith('Appropriations-S/C on '):
                # Recorded as a subcommittee of Appropriations.
                sub = name.replace('Appropriations-S/C on ', '')
                leg.add_role('committee member', term,
                             chamber=comm_chamber,
                             committee='Appropriations',
                             subcommittee=sub,
                             position=mtype)
            else:
                leg.add_role('committee member', term,
                             chamber=comm_chamber,
                             committee=name,
                             position=mtype)

        if member_type == 'Lt. Gov.':
            self.save_object(leg)
        elif district:
            # Legislators without a district are not saved.
            self.save_legislator(leg)

Пример #5

Показать файл

Файл: legislators.py Проект: opengovernment/opengovernment-local

    def scrape(self, term, chambers):
        """Scrape Philadelphia's mayor and city council members.

        The mayor is built as a ``Person`` and saved with ``save_object``.
        Each council member is built as an ``upper``-chamber ``Legislator``
        and saved with ``save_legislator``.

        :param term: term recorded on every legislator and role.
        :param chambers: not used by this method.
        """
        # The mayor doesn't sit on council.
        url = 'http://www.phila.gov/'
        doc = lxml.html.fromstring(self.urlopen(url))
        doc.make_links_absolute(url)

        # The mayor's name doesn't appear on the mayor's page!
        page_title = doc.xpath('//title/text()')[0].strip()
        full_name = re.search('Mayor (.+)', page_title).group(1)
        first_name, middle_name, last_name = parse_full_name(full_name)
        mayor = Person(full_name, first_name, last_name, middle_name)
        mayor.add_source(url)

        url = 'http://www.phila.gov/mayor/'
        doc = lxml.html.fromstring(self.urlopen(url))
        doc.make_links_absolute(url)

        lines = map(clean_string, doc.xpath(
            '//div[contains(text(),"Mailing Address")]'
            '/following-sibling::text()')[1:])
        address = '\n'.join(lines)
        phone = '-'.join(tel_regex.search(doc.xpath(
            '//strong[contains(text(),"Phone")]'
            '/following-sibling::text()[1]')[0]).groups())
        fax = '-'.join(tel_regex.search(doc.xpath(
            '//strong[contains(text(),"Fax")]'
            '/following-sibling::text()[1]')[0]).groups())
        email = clean_string(doc.xpath(
            '//strong[contains(text(),"Email")]'
            '/following-sibling::text()[1]')[0])

        mayor.update(dict(url=url, email=email))
        mayor.add_office('capitol', 'Office of the Mayor',
                         address=address, phone=phone, fax=fax)
        mayor.add_role('Mayor', term)
        mayor.add_source(url)

        self.save_object(mayor)

        council_url = 'http://philadelphiacitycouncil.net/council-members/'
        doc = lxml.html.fromstring(self.urlopen(council_url))
        doc.make_links_absolute(council_url)

        urls = set(doc.xpath(
            '//a[contains(@href, "/council-members/council")]/@href'))
        assert len(urls) <= 17, \
            'expected 17 unique councilmember URLs, found %d' % len(urls)

        for url in urls:
            doc = lxml.html.fromstring(self.urlopen(url))
            doc.make_links_absolute(url)

            optional = dict()  # fields not all legislators will have
            full_name = []
            first_name = ''
            middle_name = ''
            last_name = ''
            suffixes = ''
            roles = []
            lines = []
            lines_office2 = []
            has_office2 = False
            reached_contact_form = False
            phone1 = None
            phone1_office2 = None
            phone2 = None
            phone2_office2 = None
            fax = None
            fax_office2 = None
            office_name = None
            district = 'At-Large'  # default
            # Take the first image found, trying each selector in turn.
            photo_url = (
                # Special case for BRIAN J. O’NEILL
                doc.xpath('//img[contains(@title, "brian picture")]/@src') or
                doc.xpath('//img[contains(@class, "size-full")]/@src') or
                doc.xpath('//img[contains(@class, "size-medium")]/@src') or
                doc.xpath('//img[contains(@class, "size-thumbnail")]/@src')
            )[0]

            # That's an en dash, not a hyphen.
            parts = re.split(u'[,–]', doc.xpath('//h3/text()')[0])
            for index, part in enumerate(filter(None, parts)):
                part = clean_string(part)
                if index == 0:
                    # The first piece is the title followed by the name.
                    if 'Councilman' in part:
                        optional['gender'] = 'Male'
                    elif 'Councilwoman' in part:
                        optional['gender'] = 'Female'
                    elif 'Council President' in part:
                        roles.append('Council President')
                    part = re.sub(
                        r'^Council(?:man|woman| President)\s+', '', part)
                    full_name.append(part)
                    first_name, middle_name, last_name = parse_full_name(
                        full_name[0])
                elif part in ('Jr.', 'Sr.'):
                    full_name.append(part)
                    suffixes = part
                elif 'District' in part:
                    district = part
                else:
                    roles.append(part)
            full_name = ', '.join(full_name)

            contact_url = doc.xpath('//a[text()="Contact"]/@href')[0]
            doc = lxml.html.fromstring(self.urlopen(contact_url))
            doc.make_links_absolute(contact_url)

            # @todo email, personal_url are sometimes in another paragraph.

            parts = doc.xpath('//div[@class="post-entry"]//text()')
            parts = map(clean_string, parts)
            # True while the following unclassified lines belong to an
            # address.
            consuming_address_lines = False
            for part in filter(None, parts):

                # Special case for Curtis Jones Jr.
                if re.match(r'^Local Office:', part):
                    consuming_address_lines = True
                    has_office2 = True
                    office_name = 'Local Office'

                # The order of the branches below matters: the first match
                # decides how the line is handled.
                if (re.match(r'City Hall Office', part) or
                        re.match(r'^Hours', part) or
                        re.match(r'.*facebook', part) or
                        re.match(r'.*twitter', part) or
                        reached_contact_form):
                    continue

                elif (re.match(r'^Contact Council.*man', part) or
                        re.match(r'^Contact CMAL', part)):
                    # Everything after this heading is skipped.
                    reached_contact_form = True
                    continue

                elif re.match(r'^City Hall.+Room', part):
                    consuming_address_lines = True
                    lines.append(part)

                elif (re.match(r'^FAX:', part, re.I) or
                        re.match(r'^F:', part, re.I)):
                    consuming_address_lines = False
                    # Only the first fax number per office is kept.
                    if has_office2 and fax_office2 is None:
                        fax_office2 = '-'.join(
                            tel_regex.search(part).groups())
                    elif fax is None:
                        fax = '-'.join(tel_regex.search(part).groups())

                elif tel_regex.search(part):
                    consuming_address_lines = False
                    # Only the first phone line per office is kept.
                    if (has_office2 and phone1_office2 is None and
                            phone2_office2 is None):
                        phone1_office2, phone2_office2 = parse_phones(part)
                    elif phone1 is None and phone2 is None:
                        phone1, phone2 = parse_phones(part)

                elif '@' in part:
                    consuming_address_lines = False
                    optional['email'] = re.search(r'\S+@\S+', part).group()

                elif re.match(r'^Neighborhood Office.*', part):
                    consuming_address_lines = False
                    has_office2 = True

                elif (re.match(r'.*Office.*', part) or
                        re.match(r'.*Heroes Hall.*', part)):

                    # Special case for Curtis Jones Jr.
                    if re.match(r'.*Local Office.*', part):
                        continue

                    if len(lines_office2) > 0:
                        consuming_address_lines = False
                    else:
                        consuming_address_lines = True
                        office_name = part.strip(':;,.')

                elif consuming_address_lines:
                    if has_office2:
                        lines_office2.append(cleanup_address(part, False))
                    else:
                        lines.append(cleanup_address(part))

                elif re.match(
                        r'^(?:, )?Philadelphia, PA(?: 19107(?:-3290)?)?$',
                        part):
                    pass

                else:
                    self.logger.warning('Skipped: ' + part)

            # Some Councilmembers have no zip code or only a 5-digit zip code.
            # All that changes between them is a room number.
            address = '\n'.join(lines)
            address_office2 = '\n'.join(lines_office2)

            legislator = Legislator(term, 'upper', district, full_name,
                                    first_name, last_name, middle_name,
                                    suffixes=suffixes, url=url,
                                    photo_url=photo_url, party=None)
            legislator.update(optional)

            # An office is added only when its address has at least one
            # non-whitespace character.
            if re.search(r'.*\S.*', address):
                legislator.add_office('capitol', 'City Hall Office',
                                      address=address, phone=phone1,
                                      secondary_phone=phone2, fax=fax)

            if re.search(r'.*\S.*', address_office2):
                legislator.add_office('district', office_name,
                                      address=address_office2,
                                      phone=phone1_office2,
                                      secondary_phone=phone2_office2,
                                      fax=fax_office2)

            legislator.add_source(url)

            for role in roles:
                legislator.add_role(role, term)

            self.save_legislator(legislator)

Пример #6

Показать файл

Файл: legislators.py Проект: IanWhalen/openstates

    def scrape_member(self, chamber, term, member_url):
        """Scrape one member page and save the resulting record.

        A page whose first heading reads ``Lt. Gov.`` produces a ``Person``
        with an ``Lt. Governor`` role, saved with ``save_person``.  Any other
        page produces a ``Legislator``, saved with ``save_legislator`` only
        when a district was found.  Committee memberships listed on the page
        are added as roles in both cases.

        :param chamber: chamber passed to ``Legislator`` and to every
            committee role.
        :param term: term passed to ``Legislator`` and to every role.
        :param member_url: address of the member page; also recorded as the
            source.
        :raises IndexError: when an expected page element is missing or the
            party text is empty (the lookups below index with ``[0]``).
        """
        with self.urlopen(member_url) as page:
            root = lxml.html.fromstring(page)
            root.make_links_absolute(member_url)

            sdiv = root.xpath('//div[@class="subtitle"]')[0]
            table = sdiv.getnext()

            photo_url = table.xpath('//img[@id="ctl00_ContentPlaceHolder1'
                                    '_imgMember"]')[0].attrib['src']

            td = table.xpath('//td[@valign="top"]')[0]

            # Named ``member_type`` so the ``type`` builtin is not shadowed.
            member_type = td.xpath('string(//div[1]/strong)').strip()

            full_name = td.xpath('string(//div[2]/strong)').strip()
            full_name = re.sub(r'\s+', ' ', full_name)

            district = td.xpath('string(//div[3])').strip()
            district = district.replace('District ', '')

            addrs = {}
            for atype, text in (('capital_address', 'Capitol address:'),
                                ('district_address', 'District address:')):
                aspan = root.xpath("//span[. = '%s']" % text)
                addrs[atype] = None

                if aspan:
                    # The label's tail is the first address line; lxml gives
                    # None when there is no text there, so never "+=" onto it
                    # blindly.
                    address = aspan[0].tail
                    elem = aspan[0].getnext()
                    # Each following <br> carries one more line in its tail.
                    while elem is not None and elem.tag == 'br':
                        if elem.tail:
                            if address is None:
                                address = elem.tail
                            else:
                                address += "\n" + elem.tail
                        elem = elem.getnext()
                    addrs[atype] = address

            # Only the first character of the party text is examined.
            party = td.xpath('string(//div[4])').strip()[0]
            if party == 'D':
                party = 'Democratic'
            elif party == 'R':
                party = 'Republican'

            if member_type == 'Lt. Gov.':
                leg = Person(full_name)
                leg.add_role('Lt. Governor', term, party=party, **addrs)
            else:
                leg = Legislator(term, chamber, district, full_name,
                                 party=party, photo_url=photo_url,
                                 **addrs)

            leg.add_source(member_url)

            comm_div = root.xpath('//div[string() = "Committee Membership:"]'
                                  '/following-sibling::div'
                                  '[@class="rcwcontent"]')[0]

            for link in comm_div.xpath('*/a'):
                name = link.text

                # Test "(Vice Chair)" first; "(Chair)" is checked only when
                # that does not match.
                if '(Vice Chair)' in name:
                    mtype = 'vice chair'
                elif '(Chair)' in name:
                    mtype = 'chair'
                else:
                    mtype = 'member'

                name = clean_committee_name(link.text)

                if name.startswith('Appropriations-S/C on '):
                    # Recorded as a subcommittee of Appropriations.
                    sub = name.replace('Appropriations-S/C on ', '')
                    leg.add_role('committee member', term,
                                 chamber=chamber,
                                 committee='Appropriations',
                                 subcommittee=sub,
                                 position=mtype)
                else:
                    leg.add_role('committee member', term,
                                 chamber=chamber,
                                 committee=name,
                                 position=mtype)

            if member_type == 'Lt. Gov.':
                self.save_person(leg)
            elif district:
                # Legislators without a district are not saved.
                self.save_legislator(leg)

Пример #7

Показать файл

Файл: legislators.py Проект: tyrocca/openstates

    def scrape_member(self, chamber, term, member_url):
        """Scrape a single member page and save the person it describes.

        A page whose first heading reads ``Lt. Gov.`` yields a ``Person``
        with an ``Lt. Governor`` role, saved through ``save_object``.  Any
        other page yields a ``Legislator``, saved through ``save_legislator``
        only when a district was found.  Offices and committee roles are
        attached in both cases.  If no name can be read, a warning is issued
        and nothing is saved.

        :param chamber: chamber passed to ``Legislator`` and used as the
            default chamber of committee roles.
        :param term: term passed to ``Legislator`` and to every role.
        :param member_url: address of the member page.
        """
        doc = lxml.html.fromstring(self.get(member_url).text)
        doc.make_links_absolute(member_url)

        info_table = doc.xpath('//div[@class="subtitle"]')[0].getnext()

        portrait = info_table.xpath(
            '//img[@id="ctl00_ContentPlaceHolder1_imgMember"]')[0]
        photo_url = portrait.attrib['src']

        cell = info_table.xpath('//td[@valign="top"]')[0]

        title = cell.xpath('string(//div[1]/strong)').strip()
        is_lt_governor = title == 'Lt. Gov.'

        # Collapse whitespace in every bold heading; the last one is the name.
        names = [re.sub(r'\s+', ' ', text).strip()
                 for text in cell.xpath('//div/strong/text()')]
        if not names:
            self.warning("ERROR: CAN'T GET FULL NAME")
            return
        full_name = names[-1]

        district = cell.xpath('string(//div[3])').strip().replace(
            'District ', '')

        # Only the first character of the party text is examined; letters
        # other than D and R are passed through unchanged.
        party_letter = cell.xpath('string(//div[4])').strip()[0]
        party = {'D': 'Democratic', 'R': 'Republican'}.get(
            party_letter, party_letter)

        if is_lt_governor:
            member = Person(full_name)
            member.add_role('Lt. Governor', term, party=party)
        else:
            member = Legislator(term,
                                chamber,
                                district,
                                full_name,
                                party=party,
                                photo_url=photo_url,
                                url=member_url)

        member.add_source(urlescape(member_url))

        # Offices: one per address heading present on the page.
        for office_type, label in (('capitol', 'Capitol address'),
                                   ('district', 'District address')):
            headings = doc.xpath("//span[. = '%s:']" % label)
            if not headings:
                continue
            heading = headings[0]
            address = heading.tail.strip()
            phone = None
            # Walk the <br> siblings; each tail is either another address
            # line or, when phone_re matches, the phone number.
            node = heading.getnext()
            while node is not None and node.tag == 'br':
                line = node.tail
                if line:
                    if phone_re.match(line):
                        phone = line
                    else:
                        address += "\n" + line
                node = node.getnext()
            member.add_office(office_type, label,
                              address=address, phone=phone)

        # Committees
        membership = doc.xpath('//div[string() = "Committee Membership:"]'
                               '/following-sibling::div'
                               '[@class="rcwcontent"]')[0]

        for anchor in membership.xpath('*/a'):
            label = anchor.text

            # "(Vice Chair)" is tested before "(Chair)".
            position = 'member'
            for marker, rank in (('(Vice Chair)', 'vice chair'),
                                 ('(Chair)', 'chair')):
                if marker in label:
                    position = rank
                    break

            committee = clean_committee_name(anchor.text)

            # The mobile legislator directory offers no easy way to tell
            # whether a committee is joint (short of fetching many more
            # pages), so the one known broken case is hard-coded.
            is_joint = (committee == "Oversight of HHS Eligibility System"
                        and term == '82')
            role = {'chamber': 'joint' if is_joint else chamber,
                    'position': position}

            if committee.startswith('Appropriations-S/C on '):
                role['committee'] = 'Appropriations'
                role['subcommittee'] = committee.replace(
                    'Appropriations-S/C on ', '')
            else:
                role['committee'] = committee

            member.add_role('committee member', term, **role)

        if is_lt_governor:
            self.save_object(member)
        elif district:
            self.save_legislator(member)

Пример #8

Показать файл

Файл: legislators.py Проект: VersaHQ/openstates

    def scrape_member(self, chamber, term, member_url):
        """Scrape a single member page and save the person it describes.

        A page whose first heading reads ``Lt. Gov.`` yields a ``Person``
        with an ``Lt. Governor`` role, saved through ``save_object``.  Any
        other page yields a ``Legislator``, saved through ``save_legislator``
        only when a district was found.  Offices and committee roles are
        attached in both cases.

        :param chamber: chamber passed to ``Legislator`` and used as the
            default chamber of committee roles.
        :param term: term passed to ``Legislator`` and to every role.
        :param member_url: address of the member page.
        """
        party_names = {"D": "Democratic", "R": "Republican"}
        office_kinds = (("capitol", "Capitol address"),
                        ("district", "District address"))
        subcommittee_prefix = "Appropriations-S/C on "

        with self.urlopen(member_url) as page:
            tree = lxml.html.fromstring(page)
            tree.make_links_absolute(member_url)

            subtitle = tree.xpath('//div[@class="subtitle"]')[0]
            details = subtitle.getnext()

            image = details.xpath(
                '//img[@id="ctl00_ContentPlaceHolder1_imgMember"]')[0]
            photo_url = image.attrib["src"]

            column = details.xpath('//td[@valign="top"]')[0]

            heading = column.xpath("string(//div[1]/strong)").strip()
            lt_governor = heading == "Lt. Gov."

            raw_name = column.xpath("string(//div[2]/strong)").strip()
            full_name = re.sub(r"\s+", " ", raw_name)

            district_text = column.xpath("string(//div[3])").strip()
            district = district_text.replace("District ", "")

            # Only the first character of the party text is examined;
            # letters other than D and R are passed through unchanged.
            initial = column.xpath("string(//div[4])").strip()[0]
            party = party_names.get(initial, initial)

            if lt_governor:
                person = Person(full_name)
                person.add_role("Lt. Governor", term, party=party)
            else:
                person = Legislator(
                    term,
                    chamber,
                    district,
                    full_name,
                    party=party,
                    photo_url=photo_url,
                    url=member_url,
                )

            person.add_source(urlescape(member_url))

            # Offices: one per address heading present on the page.
            for kind, caption in office_kinds:
                spans = tree.xpath("//span[. = '%s:']" % caption)
                if not spans:
                    continue
                street = spans[0].tail.strip()
                telephone = None
                # Walk the <br> siblings; each tail is either another
                # address line or, when phone_re matches, the phone number.
                sibling = spans[0].getnext()
                while sibling is not None and sibling.tag == "br":
                    text = sibling.tail
                    if text and phone_re.match(text):
                        telephone = text
                    elif text:
                        street += "\n" + text
                    sibling = sibling.getnext()
                person.add_office(kind, caption,
                                  address=street, phone=telephone)

            # Committees
            listing = tree.xpath(
                '//div[string() = "Committee Membership:"]'
                '/following-sibling::div[@class="rcwcontent"]'
            )[0]

            for entry in listing.xpath("*/a"):
                raw = entry.text

                # "(Vice Chair)" is tested before "(Chair)".
                if "(Vice Chair)" in raw:
                    position = "vice chair"
                else:
                    position = "chair" if "(Chair)" in raw else "member"

                committee = clean_committee_name(entry.text)

                # The mobile legislator directory offers no easy way to tell
                # whether a committee is joint (short of fetching many more
                # pages), so the one known broken case is hard-coded.
                joint = (committee == "Oversight of HHS Eligibility System"
                         and term == "82")
                role_chamber = "joint" if joint else chamber

                if not committee.startswith(subcommittee_prefix):
                    person.add_role("committee member", term,
                                    chamber=role_chamber,
                                    committee=committee,
                                    position=position)
                    continue

                person.add_role("committee member", term,
                                chamber=role_chamber,
                                committee="Appropriations",
                                subcommittee=committee.replace(
                                    subcommittee_prefix, ""),
                                position=position)

            if lt_governor:
                self.save_object(person)
            elif district:
                self.save_legislator(person)

Пример #9

Показать файл

Файл: legislators.py Проект: janbenson/opengovernment-local

    def scrape(self, term, chambers):
        """Scrape Philadelphia's mayor and city council members for ``term``.

        The mayor is built as a ``Person`` with a 'Mayor' role and stored with
        ``save_object``.  Every council member linked from the council index
        page is built as an 'upper' chamber ``Legislator`` and stored with
        ``save_legislator``.  ``chambers`` is accepted but not read here.

        Raises:
            AssertionError: if the council index links to more than 17
                distinct member pages.
            IndexError, AttributeError: if a scraped page no longer has the
                expected layout (the xpath ``[0]`` lookups and most regex
                matches are not guarded).
        """
        # The mayor doesn't sit on council.
        url = 'http://www.phila.gov/'
        doc = lxml.html.fromstring(self.urlopen(url))
        doc.make_links_absolute(url)

        # The mayor's name doesn't appear on the mayor's page, so take it
        # from the <title> of the city home page instead.
        title = doc.xpath('//title/text()')[0].strip()
        name = re.search('Mayor (.+)', title).group(1)
        mayor = Person(name)
        mayor.add_source(url)

        url = 'http://www.phila.gov/mayor/'
        doc = lxml.html.fromstring(self.urlopen(url))
        doc.make_links_absolute(url)

        # The first text node after the "Mailing Address" div is skipped.
        address_nodes = doc.xpath('//div[contains(text(),"Mailing Address")]/following-sibling::text()')[1:]
        address = '\n'.join(clean_string(node) for node in address_nodes)
        phone = '-'.join(tel_regex.search(doc.xpath('//strong[contains(text(),"Phone")]/following-sibling::text()[1]')[0]).groups())
        fax = '-'.join(tel_regex.search(doc.xpath('//strong[contains(text(),"Fax")]/following-sibling::text()[1]')[0]).groups())
        email = clean_string(doc.xpath('//strong[contains(text(),"Email")]/following-sibling::text()[1]')[0])

        mayor.update(dict(url=url, email=email))
        mayor.add_office('capitol', 'Office of the Mayor', address=address, phone=phone, fax=fax)
        mayor.add_role('Mayor', term)
        mayor.add_source(url)

        self.save_object(mayor)

        council_url = 'http://philadelphiacitycouncil.net/council-members/'
        doc = lxml.html.fromstring(self.urlopen(council_url))
        doc.make_links_absolute(council_url)

        urls = set(doc.xpath('//a[contains(@href, "/council-members/council")]/@href'))
        # NOTE(review): the check allows fewer than 17 URLs although the
        # message says "expected 17" -- confirm which one is intended.
        assert len(urls) <= 17, 'expected 17 unique councilmember URLs, found %d' % len(urls)

        for url in urls:
            doc = lxml.html.fromstring(self.urlopen(url))
            doc.make_links_absolute(url)

            optional  = dict() # fields not all legislators will have
            name      = []
            roles     = []
            lines     = []
            phone1    = None
            phone2    = None
            fax       = None
            district  = 'At-Large' # default
            # Prefer the largest available rendition of the member photo.
            photo_url = (
                doc.xpath('//img[contains(@class, "size-full")]/@src') or
                doc.xpath('//img[contains(@class, "size-medium")]/@src') or
                doc.xpath('//img[contains(@class, "size-thumbnail")]/@src')
            )[0]

            # The heading is split on commas and en dashes into title + name,
            # an optional name suffix, the district and any extra roles.
            # That's an en dash, not a hyphen.
            parts = re.split(u'[,–]', doc.xpath('//h3/text()')[0])
            for index, part in enumerate(filter(None, parts)):
                part = clean_string(part)
                if index == 0:
                    if 'Councilman' in part:
                        optional['gender'] = 'Male'
                    elif 'Councilwoman' in part:
                        optional['gender'] = 'Female'
                    elif 'Council President' in part:
                        roles.append('Council President')
                    part = re.sub(r'^Council(?:man|woman| President)\s+', '', part)
                    name.append(part)
                elif part in ('Jr.', 'Sr.'):
                    name.append(part)
                elif 'District' in part:
                    district = part
                else:
                    roles.append(part)
            name = ', '.join(name)

            contact_url = doc.xpath('//a[text()="Contact"]/@href')[0]
            doc = lxml.html.fromstring(self.urlopen(contact_url))
            doc.make_links_absolute(contact_url)

            # @todo email, second office, personal_url are sometimes in another paragraph.
            if len(doc.xpath('//div[@class="post-entry"]/p')) > 1:
                self.logger.warning('Skipped paragraphs:\n' + '\n'.join(lxml.html.tostring(html) for html in doc.xpath('//div[@class="post-entry"]/p[position()>1]')))

            parts = doc.xpath('//div[@class="post-entry"]/p[position()=1]//text()') or doc.xpath('//div[@class="post-entry"]//text()')
            # Build a real list: parts[2] is indexed below, which a lazy
            # ``map`` object would not support.
            parts = [clean_string(text) for text in parts]
            for part in filter(None, parts):
                if re.match(r'^City Hall', part):
                    lines.append('City Hall, Room %s' % re.search(r'Room (\d+)', part).group(1))
                elif re.match(r'^FAX:', part, re.I):
                    fax = '-'.join(tel_regex.search(part).groups())
                elif tel_regex.search(part):
                    if phone1:
                        self.logger.warning('Already have phone numbers for one office: ' + part)
                    else:
                        phones = tel_regex.findall(part)
                        phone1 = '-'.join(phones[0])
                        if len(phones) == 2:
                            phone2 = '-'.join(phones[1])
                        else:
                            # The second number may be given only as a
                            # four-digit ending (" or 1234" / "/1234") on the
                            # third text node; reuse the first 8 characters
                            # of phone1 as its prefix.  When there is no such
                            # ending, leave phone2 as None instead of
                            # crashing the whole scrape.
                            ending = None
                            if len(parts) > 2:
                                ending = re.search(r'(?: or |/)(\d{4})$', parts[2])
                            if ending:
                                phone2 = phone1[:8] + ending.group(1)
                elif '@' in part:
                    optional['email'] = re.search(r'\S+@\S+', part).group()
                elif re.match(r'^(?:, )?Philadelphia, PA(?: 19107(?:-3290)?)?$', part):
                    # City/zip line; a canonical one is appended below.
                    pass
                else: # @todo second office is sometimes in the same paragraph.
                    self.logger.warning('Skipped: ' + part)

            # Some Councilmembers have no zip code or only a 5-digit zip code.
            # All that changes between them is a room number.
            lines.append('Philadelphia, PA 19107-3290')
            address = '\n'.join(lines)

            legislator = Legislator(term, 'upper', district, name, url=url, photo_url=photo_url, party=None)
            legislator.update(optional)
            legislator.add_office('capitol', 'Council Office', address=address, phone=phone1, secondary_phone=phone2, fax=fax)
            legislator.add_source(url)

            for role in roles:
                legislator.add_role(role, term)

            self.save_legislator(legislator)