예제 #1
0
def test_person_add_membership_name():
    """Adding a membership by organization name stores a pseudo-id for the org."""
    org_name = 'Academy of Motion Picture Arts and Sciences'
    actor = Person('Leonardo DiCaprio')
    actor.add_membership(org_name, role='winner', start_date='2016')

    membership = actor._related[0]
    membership.validate()

    # The organization is referenced by a pseudo-id that decodes to its name.
    assert get_pseudo_id(membership.organization_id) == {'name': org_name}
    assert membership.person_id == actor._id
    assert membership.role == 'winner'
    assert membership.start_date == '2016'
예제 #2
0
def test_person_add_membership():
    """A membership built from an Organization links person and org by id."""
    bear = Person('Bob B. Bear')
    bear.add_source('http://example.com')
    org = Organization('test org')

    # The role is passed positionally here.
    bear.add_membership(org, 'member', start_date='2007')

    assert len(bear._related) == 1
    membership = bear._related[0]
    membership.validate()
    assert membership.person_id == bear._id
    assert membership.organization_id == org._id
    assert membership.start_date == '2007'
예제 #3
0
def test_person_add_membership():
    """An Organization-based membership records both ids and the start date."""
    bear = Person('Bob B. Bear')
    bear.add_source('http://example.com')
    org = Organization('test org', classification='unknown')

    bear.add_membership(org, role='member', start_date='2007')

    # Exactly one related object (the membership) must have been created.
    assert len(bear._related) == 1
    membership = bear._related[0]
    membership.validate()
    assert membership.person_id == bear._id
    assert membership.organization_id == org._id
    assert membership.start_date == '2007'
예제 #4
0
def test_person_add_membership_org():
    """A membership keeps its start date string and its ``date`` end date."""
    term_end = datetime.date(2015, 5, 8)

    bear = Person('Bob B. Bear')
    bear.add_source('http://example.com')
    org = Organization('test org', classification='unknown')
    bear.add_membership(org, role='member', start_date='2007',
                        end_date=term_end)

    assert len(bear._related) == 1
    membership = bear._related[0]
    membership.validate()
    assert membership.person_id == bear._id
    assert membership.organization_id == org._id
    assert membership.start_date == '2007'
    assert membership.end_date == term_end
예제 #5
0
    def scrape(self):
        """Yield the Temecula City Council, then one Person per roster row.

        Rows are read from the second table of the list page, skipping its
        first row. Each row must unpack into exactly one name/role pair and
        exactly two hrefs, otherwise the tuple unpacking raises ValueError.
        """
        urls = Urls(dict(list=legislators_url), self)

        council = Organization('Temecula City Council',
                               classification='legislature')
        council.add_source(urls.list.url)
        yield council

        member_rows = urls.list.xpath('//table[2]//tr')[1:]
        for row in member_rows:
            name, role = row.xpath('td/p[1]//font/text()')
            image = row.xpath('td/img/@src').pop()

            person = Person(name, image=image)
            membership = person.add_membership(council, role=role)

            # The first href has its leading seven characters removed
            # (presumably the "mailto:" scheme); the second is the detail page.
            mailto_href, detail_url = row.xpath('td//a/@href')
            membership.contact_details.append(
                {'type': 'email', 'value': mailto_href[7:], 'note': 'work'})

            person.add_source(urls.list.url)
            person.add_source(detail_url)
            yield person
예제 #6
0
    def scrape(self):
        """Emit the council organization followed by each council member.

        The council is yielded first so the memberships created afterwards
        can refer to it. Members come from every row but the first of the
        second table on the list page.
        """
        urls = Urls(dict(list=legislators_url), self)

        council = Organization(
            'Temecula City Council',
            classification='legislature',
        )
        council.add_source(urls.list.url)
        yield council

        all_rows = urls.list.xpath('//table[2]//tr')
        for tr in all_rows[1:]:
            # Basic attributes: a (name, role) pair and the last image src.
            texts = tr.xpath('td/p[1]//font/text()')
            name, role = texts
            image = tr.xpath('td/img/@src').pop()

            legislator = Person(name, image=image)
            council_seat = legislator.add_membership(council, role=role)

            # Two links per row: an e-mail href, then the detail page.
            hrefs = tr.xpath('td//a/@href')
            email, detail_url = hrefs
            work_email = dict(type='email', value=email[7:], note='work')
            council_seat.contact_details.append(work_email)

            for source in (urls.list.url, detail_url):
                legislator.add_source(source)

            yield legislator
예제 #7
0
    def scrape(self):
        """Yield the Boise City Council, then a Person for each member page.

        Member pages are discovered from the links in the list page's
        content div; each page is then loaded as ``urls.detail`` and parsed
        for an image, a heading (role + name) and a mailto link.
        """
        urls = Urls(dict(list=legislators_url), self)

        council = Organization('Boise City Council')
        council.add_source(legislators_url)
        yield council

        # Skip the first link: the mayor's page has no name or email.
        detail_links = urls.list.xpath('//div[@id="content"]/div/a/@href')[1:]

        for detail_link in detail_links:
            urls.add(detail=detail_link)

            image = urls.detail.xpath('//div[@id="content"]/p/img/@src').pop()
            heading = urls.detail.xpath('//h1/text()').pop()

            # Drop "Council " and split at the first space: the leading word
            # becomes the role, the remainder the person's name.
            role, _, name = heading.replace('Council ', '').partition(' ')

            person = Person(name, image=image)

            membership = person.add_membership(council, role=role)
            membership.add_source(urls.detail.url)

            # The href's first seven characters are removed
            # (presumably the "mailto:" scheme).
            mailto_hrefs = urls.detail.xpath(
                '//a[contains(@href, "mailto")]/@href')
            membership.contact_details.append(
                {'type': 'email', 'value': mailto_hrefs.pop()[7:],
                 'note': 'work'})

            person.add_source(urls.list.url)
            person.add_source(urls.detail.url)

            yield person
예제 #8
0
    def scrape(self):
        """Yield New York City Council committees and the people on them.

        Council members are assembled from two sources keyed by person name:
        the Legistar web member list (``web_info``) and the API office
        records of the 'City Council' body (``terms``). Committee-like
        bodies are yielded as Organizations once their memberships have been
        attached to the relevant Person; every Person is yielded at the end.

        Raises:
            ValueError: if ``self.bodies()`` does not contain exactly one
                body named 'City Council'.
            AssertionError: if the set of names from the web list and the
                set of names from the API office records differ.
        """
        web_scraper = LegistarPersonScraper(
            requests_per_minute=self.requests_per_minute)
        web_scraper.MEMBERLIST = 'http://legistar.council.nyc.gov/DepartmentDetail.aspx?ID=6897&GUID=CDC6E691-8A8C-4F25-97CB-86F31EDAB081&Mode=MainBody'

        if self.cache_storage:
            web_scraper.cache_storage = self.cache_storage

        if self.requests_per_minute == 0:
            web_scraper.cache_write_only = False

        web_info = {}

        for member, _ in web_scraper.councilMembers():
            name = member['Person Name']['label'].strip()
            web_info[name] = member

        city_council, = [
            body for body in self.bodies()
            if body['BodyName'] == 'City Council'
        ]

        terms = collections.defaultdict(list)

        public_advocates = {  # Match casing to Bill De Blasio as council member
            'The Public Advocate (Mr. de Blasio)': 'Bill De Blasio',
            'The Public Advocate (Ms. James)': 'Letitia James',
        }

        for office in self.body_offices(city_council):
            name = office['OfficeRecordFullName']
            name = public_advocates.get(name, name).strip()

            terms[name].append(office)

            # Add past members (and public advocates) that are missing from
            # the web list; every lookup on their placeholder yields None.
            if name not in web_info:
                web_info[name] = collections.defaultdict(lambda: None)

        # Check that we have everyone we expect, formatted consistently, in
        # both information arrays. For instance, this will fail if we forget to
        # strip trailing spaces from names on one side or the other (which has
        # the effect of omitting information, such as post, from the scrape).

        assert set(web_info.keys()) == set(terms.keys())

        # Web field name -> (contact detail type, note). Loop-invariant, so it
        # is built once rather than once per term.
        contact_types = {
            "City Hall Office": ("address", "City Hall Office"),
            "City Hall Phone": ("voice", "City Hall Phone"),
            "Ward Office Phone": ("voice", "Ward Office Phone"),
            "Ward Office Address": ("address", "Ward Office Address"),
            "Fax": ("fax", "Fax")
        }

        members = {}

        for member, offices in terms.items():

            p = Person(member)

            web = web_info[member]

            # NOTE(review): party, contact details, links and extras below are
            # applied once per term, not once per person - confirm that the
            # repetition for multi-term members is intended.
            for term in offices:
                role = term['OfficeRecordTitle']

                if role == 'Public Advocate':
                    role = 'Non-Voting Council Member'
                else:
                    role = 'Council Member'

                district = web.get('District', '').replace(' 0', ' ')

                p.add_term(role,
                           'legislature',
                           district=district,
                           start_date=self.toDate(
                               term['OfficeRecordStartDate']),
                           end_date=self.toDate(term['OfficeRecordEndDate']))

                party = web.get('Political Party')

                if party == 'Democrat':
                    party = 'Democratic'

                if party:
                    p.add_party(party)

                if web.get('Photo'):
                    p.image = web['Photo']

                for contact_type, (type_, _note) in contact_types.items():
                    # Index the mapping; the previous ``web(contact_type)``
                    # called it and raised TypeError for any populated field.
                    if web.get(contact_type) and web[contact_type] != 'N/A':
                        p.add_contact_detail(type=type_,
                                             value=web[contact_type],
                                             note=_note)

                if web.get('E-mail'):
                    p.add_contact_detail(type="email",
                                         value=web['E-mail']['url'],
                                         note='E-mail')

                if web.get('Web site'):
                    p.add_link(web['Web site']['url'], note='web site')

                if web.get('Notes'):
                    p.extras = {'Notes': web['Notes']}

                if not p.sources:  # Only add sources once
                    source_urls = self.person_sources_from_office(term)
                    person_api_url, person_web_url = source_urls
                    p.add_source(person_api_url, note='api')
                    p.add_source(person_web_url, note='web')

            members[member] = p

        committee_types = [
            'Committee', 'Inactive Committee', 'Select Committee',
            'Subcommittee', 'Task Force', 'Land Use'
        ]  # Committee on Land Use

        body_types = {
            k: v
            for k, v in self.body_types().items() if k in committee_types
        }

        for body in self.bodies():
            if body['BodyTypeName'] in body_types \
                or body['BodyName'] in ('Legislative Documents Unit',
                                        'Legal and Government Affairs Division'):

                # Skip typo in API data
                if body['BodyName'] == 'Committee on Mental Health, Developmental Disability, Alcoholism, Substance Abuse amd Disability Services':
                    continue

                parent_org = PARENT_ORGS.get(body['BodyName'],
                                             'New York City Council')

                body_name = body['BodyName']

                o = Organization(body_name,
                                 classification='committee',
                                 parent_id={'name': parent_org})

                o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body),
                             note='api')
                o.add_source(
                    self.WEB_URL +
                    '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'.
                    format(**body),
                    note='web')

                for office in self.body_offices(body):
                    # Possible roles: 'Council Member', 'MEMBER', 'Ex-Officio',
                    # 'Committee Member', None, 'CHAIRPERSON'

                    role = office['OfficeRecordTitle']

                    if role and role.lower() == 'chairperson':
                        role = 'Chairperson'
                    else:
                        role = 'Member'

                    person = office['OfficeRecordFullName']
                    person = public_advocates.get(person, person).strip()

                    if person in members:
                        p = members[person]
                    else:
                        # Committee member who never held a council office:
                        # create the Person on first sight and remember it.
                        p = Person(person)

                        source_urls = self.person_sources_from_office(office)
                        person_api_url, person_web_url = source_urls
                        p.add_source(person_api_url, note='api')
                        p.add_source(person_web_url, note='web')

                        members[person] = p

                    p.add_membership(o,
                                     role=role,
                                     start_date=self.toDate(
                                         office['OfficeRecordStartDate']),
                                     end_date=self.toDate(
                                         office['OfficeRecordEndDate']))

                yield o

        for p in members.values():
            yield p
예제 #9
0
    def scrape_chamber(self, chamber):
        """Yield a Person for every non-vacant member on a chamber's roster.

        ``chamber`` must be ``'lower'`` or ``'upper'``; any other value
        raises KeyError when it is mapped to the roster's body code. Each
        member's own page is fetched as well, to look for a photo.
        """
        body = {'lower': 'H', 'upper': 'S'}[chamber]
        url = 'http://www.azleg.gov/MemberRoster/?body=' + body

        # The page contains a bad comment closing tag; repair it before parsing.
        page = self.get(url).text.replace('--!>', '-->')
        root = html.fromstring(page)

        # Skip the first table row.
        for row in root.xpath('//table//tr')[1:]:
            (name_td, district_td, party_td,
             email_td, room_td, phone_td) = row.xpath('td')

            if email_td.attrib.get('class') == 'vacantmember':
                continue  # Vacant seats produce no Person.

            link = name_td.xpath('string(a/@href)')
            if len(name_td) == 1:
                position = ''
                name = name_td.text_content().strip()
            else:
                position = name_td.tail.strip()
                name = name_td[0].text_content().strip()
            if '--' in name:
                # Keep only the part before the first "--".
                name = name.split('--')[0].strip()

            linkpage = self.get(link).text.replace('--!>', '-->')
            linkroot = html.fromstring(linkpage)
            linkroot.make_links_absolute(link)

            # A photo is only used when exactly one candidate image exists.
            photos = linkroot.xpath("//img[contains(@src, 'MemberPhoto')]")
            if len(photos) == 1:
                photo_url = photos[0].attrib['src']
            else:
                self.warning('no photo on ' + link)
                photo_url = ''

            district = district_td.text_content()
            party = party_td.text_content().strip()
            email = email_td.text_content().strip()

            if email.startswith('Email: '):
                email = email.replace('Email: ', '').lower() + '@azleg.gov'
            else:
                email = ''

            party = self.get_party(party)
            room = room_td.text_content().strip()

            chamber_line = ("House of Representatives\n"
                            if chamber == 'lower' else "Senate\n")
            address = (chamber_line + "1700 West Washington\n Room " + room
                       + "\nPhoenix, AZ 85007")

            phone = phone_td.text_content().strip()
            # Prefix the 602 area code unless one of the digit groups is 602.
            if '602' not in re.findall(r'(\d+)', phone):
                phone = "602-" + phone

            leg = Person(primary_org=chamber,
                         image=photo_url,
                         name=name,
                         district=district,
                         party=party)
            leg.add_contact_detail(type='address', value=address,
                                   note='Capitol Office')
            leg.add_contact_detail(type='voice', value=phone,
                                   note='Capitol Office')
            leg.add_party(party=party)
            leg.add_link(link)

            if email:
                leg.add_contact_detail(type='email', value=email)
            if position:
                # The position is recorded as a membership of the party.
                leg.add_membership(name_or_org=party, role=position)

            leg.add_source(url)

            yield leg
예제 #10
0
    def scrape(self):
        """Yield Pittsburgh City Council committees and council members.

        Council members are built from the API office records of the
        'City Council' body (records whose name contains "VACAN" are
        ignored) and enriched with the e-mail shown on the Legistar web
        member list and with address/phone/website from the person API
        response. Committees are yielded as Organizations after their
        memberships have been attached; members are yielded at the end.

        Raises:
            ValueError: if ``self.bodies()`` does not contain exactly one
                body named "City Council".
        """
        body_types = self.body_types()
        city_council, = [body for body in self.bodies()
                         if body["BodyName"] == "City Council"]
        terms = collections.defaultdict(list)

        for office in self.body_offices(city_council):
            if "VACAN" not in office["OfficeRecordFullName"]:
                terms[office["OfficeRecordFullName"].strip()].append(office)

        web_scraper = LegistarPersonScraper(requests_per_minute=self.requests_per_minute)
        web_scraper.MEMBERLIST = "https://pittsburgh.legistar.com/People.aspx"
        web_scraper.COMMITTEELIST = "https://pittsburgh.legistar.com/Departments.aspx"

        if self.cache_storage:
            web_scraper.cache_storage = self.cache_storage

        if self.requests_per_minute == 0:
            web_scraper.cache_write_only = False

        web_info = {}
        for member in web_scraper.councilMembers():
            web_info[member["Person Name"]] = member

        members = {}
        for member, offices in terms.items():
            person = Person(member)
            for term in offices:
                # Every council office is recorded with the same role.
                person.add_term("Councilmember",
                                "legislature",
                                start_date=self.toDate(term["OfficeRecordStartDate"]),
                                end_date=self.toDate(term["OfficeRecordEndDate"]))

            if member in web_info:
                web = web_info[member]
                if web["E-mail"] and web["E-mail"]["label"] and web["E-mail"]["label"] != "N/A":
                    person.add_contact_detail(type="email",
                                              value=web["E-mail"]["label"],
                                              note="E-mail")

            # Sources come from the person's last office record. ``offices``
            # is never empty because entries are only created by appending.
            person_source_data = self.person_sources_from_office(offices[-1])
            person_api_url, person_api_response = person_source_data
            person.add_source(person_api_url, note="api")

            if person_api_response["PersonAddress1"]:
                address = (person_api_response["PersonAddress1"] + ", " + person_api_response["PersonCity1"]
                           + ", " + person_api_response["PersonState1"] + " " + person_api_response["PersonZip1"])
                person.add_contact_detail(type="address",
                                          value=address,
                                          note="Office address")

            if person_api_response["PersonPhone"]:
                person.add_contact_detail(type="voice",
                                          value=person_api_response["PersonPhone"],
                                          note="Office phone")

            if person_api_response["PersonWWW"]:
                person.add_contact_detail(type="url",
                                          value=person_api_response["PersonWWW"],
                                          note="District website")

            members[member] = person

        for body in self.bodies():
            if body["BodyTypeId"] == body_types["Committee"]:
                body_name_clean = body["BodyName"].strip()
                organization = Organization(body_name_clean,
                                            classification="committee",
                                            parent_id={"name": "Pittsburgh City Council"})

                organization.add_source(self.BASE_URL + "/bodies/{BodyId}".format(**body), note="api")

                for office in self.body_offices(body):
                    # Anything other than a chair or vice chair is a plain
                    # member (this also covers "Councilmember").
                    role = office["OfficeRecordMemberType"]
                    if role not in ("Vice Chair", "Chair"):
                        role = "Member"

                    person = office["OfficeRecordFullName"].strip()
                    if person in members:
                        person = members[person]
                    else:
                        # NOTE(review): this Person is not stored in
                        # ``members`` and is therefore never yielded -
                        # confirm that dropping non-council members is
                        # intended.
                        person = Person(person)

                    person.add_membership(body_name_clean,
                                          role=role,
                                          start_date=self.toDate(office["OfficeRecordStartDate"]),
                                          end_date=self.toDate(office["OfficeRecordEndDate"]))

                yield organization

        for person in members.values():
            yield person
예제 #11
0
    def scrape(self):
        """Yield the Denver City Council, then a Person per council member.

        Image URLs, names, detail-page URLs and post ids are collected as
        four parallel sequences from the list page and combined with
        ``zip``, so iteration stops at the shortest of them. The council is
        yielded after its posts have been added and before any Person.

        Raises:
            AttributeError: if a member's contact text contains no match for
                the phone or e-mail pattern (``re.search`` returns None).
        """
        urls = Urls(dict(list=legislators_url), self)

        council = Organization('Denver City Council')
        council.add_source(legislators_url)

        # Get image urls, names, detail urls, and districts.
        image_xpath = '//a[contains(@href, "councildistrict")]/img/@src'
        image_urls = urls.list.xpath(image_xpath)

        # The last matching anchor is dropped and empty link texts are
        # filtered out.
        name_xpath = '//a[contains(@href, "councildistrict")]'
        names = [a.text_content() for a in urls.list.xpath(name_xpath)][:-1]
        names = filter(None, names)

        person_urls_xpath = '//a[contains(@href, "councildistrict")]/@href'
        person_urls = urls.list.xpath(person_urls_xpath)

        post_ids = []
        xpath = '//a[contains(@href, "councildistrict")]/img/ancestor::td'
        for td in urls.list.xpath(xpath):
            text = td.text_content()
            # Raw strings keep ``\d`` a regex escape instead of an invalid
            # string escape (a SyntaxWarning on recent Python versions).
            m = re.search(r'Council District \d+', text)
            if m:
                post_ids.append(m.group())
                continue
            m = re.search(r'Council At-Large', text)
            if m:
                post_ids.append('Council At-Large')

        for post_id in post_ids:
            council.add_post(post_id, post_id)
        yield council

        data = zip(image_urls, names, person_urls, post_ids)
        for image_url, name, person_url, post_id in data:

            # Create legislator.
            person = Person(name, image=image_url)

            # Add sources.
            urls.add(detail=person_url)
            person.add_source(urls.list.url, note='list')
            person.add_source(urls.detail.url, note='detail')

            # Add membership on council.
            memb = person.add_membership(council, district=post_id.strip())
            memb.add_source(urls.detail.url)

            # Contact text lives in one of two containers; fall back to the
            # second when the first is blank.
            xpath = '//div[@id="dnn_column3"]'
            contact_text = urls.detail.xpath(xpath)[0].text_content()

            if not contact_text.strip():
                xpath = '//div[contains(@id, "dnn_RightPaneWide")]'
                contact_text = urls.detail.xpath(xpath)[0].text_content()

            phone_regex = r'\(\d{3}\)[ -]*\d{3}-\d{4}'
            phone = re.search(phone_regex, contact_text).group()
            memb.contact_details.append(
                dict(type='phone', value=phone, note='work'))

            # Add email address.
            # NOTE(review): this pattern only matches the literal text
            # "[email protected]"; it looks like the original expression was
            # replaced by an e-mail obfuscator - confirm the intended regex.
            email_regex = r'\[email protected]'
            email = re.search(email_regex, contact_text).group()
            memb.contact_details.append(
                dict(type='email', value=email, note='work'))

            yield person
예제 #12
0
    def scrape(self):
        '''
        Yield Metro board committees and the people who serve on them.

        The Legistar web member list is scraped first to build ``web_info``,
        a mapping from organization name to its 'Department Name' entry, so
        that each committee can be given the URL of its own detail page.
        Board members are then built from the API office records of the
        'Board of Directors - Regular Board Meeting' body, committees are
        yielded with their memberships attached, and every Person is
        yielded at the end.

        Raises ValueError if ``self.bodies()`` does not contain exactly one
        body named 'Board of Directors - Regular Board Meeting'.
        '''
        web_scraper = LegistarPersonScraper(
            requests_per_minute=self.requests_per_minute)
        web_scraper.MEMBERLIST = 'https://metro.legistar.com/People.aspx'
        web_info = {}

        for _, organizations in web_scraper.councilMembers():
            for organization, _, _ in organizations:
                organization_name = organization['Department Name'][
                    'label'].strip()
                organization_info = organization['Department Name']

                web_info[organization_name] = organization_info

        body_types = self.body_types()

        board_of_directors, = [
            body for body in self.bodies()
            if body['BodyName'] == 'Board of Directors - Regular Board Meeting'
        ]
        board_of_directors["BodyName"] = "Board of Directors"

        terms = collections.defaultdict(list)
        for office in self.body_offices(board_of_directors):
            terms[office['OfficeRecordFullName']].append(office)

        members = {}
        for member, offices in terms.items():
            p = Person(member)
            for term in offices:
                role = term['OfficeRecordTitle']

                # Titles other than the two plain membership titles are
                # recorded as an appointed term under that title.
                if role not in {'Board Member', 'non-voting member'}:
                    p.add_term(
                        role,
                        'legislature',
                        start_date=self.toDate(term['OfficeRecordStartDate']),
                        end_date=self.toDate(term['OfficeRecordEndDate']),
                        appointment=True)
                # Everyone but the CEO also gets a (non)voting board term.
                if role != 'Chief Executive Officer':
                    if role == 'non-voting member':
                        member_type = 'Nonvoting Board Member'
                        post = NONVOTING_POSTS.get(member)
                    else:
                        member_type = 'Board Member'
                        post = VOTING_POSTS.get(member)

                    start_date = self.toDate(term['OfficeRecordStartDate'])
                    end_date = self.toDate(term['OfficeRecordEndDate'])
                    board_membership = p.add_term(member_type,
                                                  'legislature',
                                                  district=post,
                                                  start_date=start_date,
                                                  end_date=end_date)

                    acting_member_end_date = ACTING_MEMBERS_WITH_END_DATE.get(
                        p.name)
                    if acting_member_end_date and acting_member_end_date <= end_date:
                        board_membership.extras = {'acting': 'true'}

            # Sources come from the person's last office record. ``offices``
            # is never empty because entries are only created by appending.
            source_urls = self.person_sources_from_office(offices[-1])
            person_api_url, person_web_url = source_urls
            p.add_source(person_api_url, note='api')
            p.add_source(person_web_url, note='web')

            members[member] = p

        for body in self.bodies():
            if body['BodyTypeId'] == body_types['Committee']:
                organization_name = body['BodyName'].strip()
                o = Organization(organization_name,
                                 classification='committee',
                                 parent_id={'name': 'Board of Directors'})

                # Fall back to the generic departments page when the web
                # scrape has no entry for this committee. The fallback is
                # already an absolute URL, so it must not be prefixed with
                # self.WEB_URL.
                organization_info = web_info.get(organization_name, {})
                organization_url = organization_info.get(
                    'url', 'https://metro.legistar.com/Departments.aspx')

                o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body),
                             note='api')
                o.add_source(organization_url, note='web')

                for office in self.body_offices(body):
                    role = office['OfficeRecordTitle']

                    if role not in ("Chair", "Vice Chair",
                                    "Chief Executive Officer"):
                        if role == 'non-voting member':
                            role = 'Nonvoting Member'
                        else:
                            role = 'Member'

                    person = office['OfficeRecordFullName']

                    if person in members:
                        p = members[person]
                    else:
                        # Committee member without a board office: create
                        # the Person on first sight and remember it.
                        p = Person(person)

                        source_urls = self.person_sources_from_office(office)
                        person_api_url, person_web_url = source_urls
                        p.add_source(person_api_url, note='api')
                        p.add_source(person_web_url, note='web')

                        members[person] = p

                    start_date = self.toDate(office['OfficeRecordStartDate'])
                    end_date = self.toDate(office['OfficeRecordEndDate'])
                    membership = p.add_membership(organization_name,
                                                  role=role,
                                                  start_date=start_date,
                                                  end_date=end_date)

                    acting_member_end_date = ACTING_MEMBERS_WITH_END_DATE.get(
                        p.name)
                    if acting_member_end_date and acting_member_end_date <= end_date:
                        membership.extras = {'acting': 'true'}

                yield o

        for p in members.values():
            yield p
예제 #13
0
    def scrape(self):
        """Yield Sacramento committees, boards/commissions and their people.

        Output order: every 'Standing Committees' body (with memberships
        attached to the relevant Person), then every 'Boards or Commission'
        body, then all collected Person objects.
        """
        body_types = self.body_types()

        # Exactly one body must match; note the trailing space in the name.
        city_council, = [candidate for candidate in self.bodies()
                         if candidate['BodyName'] == 'City Council ']

        # Group council office records by holder, ignoring "Granicus BA".
        terms = collections.defaultdict(list)
        for record in self.body_offices(city_council):
            holder = record['OfficeRecordFullName']
            if holder != "Granicus BA":
                terms[holder].append(record)

        members = {}
        for member, offices in terms.items():
            p = Person(member)
            for term in offices:
                p.add_term(term['OfficeRecordTitle'],
                           'legislature',
                           start_date=self.toDate(term['OfficeRecordStartDate']),
                           end_date=self.toDate(term['OfficeRecordEndDate']))

            # ``term`` is the last office record seen in the loop above.
            person_api_url, person_web_url = self.person_sources_from_office(term)
            p.add_source(person_api_url, note='api')
            p.add_source(person_web_url, note='web')

            members[member] = p

        for body in self.bodies():
            if body['BodyTypeId'] != body_types['Standing Committees']:
                continue

            o = Organization(body['BodyName'],
                             classification='committee',
                             parent_id={'name': 'Sacramento City Council'})

            api_source = self.BASE_URL + '/bodies/{BodyId}'.format(**body)
            o.add_source(api_source, note='api')
            web_source = (self.WEB_URL
                          + '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'.format(**body))
            o.add_source(web_source, note='web')

            for office in self.body_offices(body):
                # Record 1055 (joanna thompson) is known to be broken.
                if office['OfficeRecordId'] == 1055:
                    continue

                role = office['OfficeRecordTitle']
                if role not in ("Vice Chair", "Chairperson"):
                    role = 'Member'

                person = office['OfficeRecordFullName'].strip()
                if person not in members:
                    # First sighting of someone without a council office.
                    newcomer = Person(person)
                    newcomer_api_url, newcomer_web_url = \
                        self.person_sources_from_office(office)
                    newcomer.add_source(newcomer_api_url, note='api')
                    newcomer.add_source(newcomer_web_url, note='web')
                    members[person] = newcomer
                p = members[person]

                p.add_membership(body['BodyName'],
                                 role=role,
                                 start_date=self.toDate(office['OfficeRecordStartDate']),
                                 end_date=self.toDate(office['OfficeRecordEndDate']))

            yield o

        for body in self.bodies():
            if body['BodyTypeId'] != body_types['Boards or Commission']:
                continue

            o = Organization(body['BodyName'],
                             classification='commission',
                             parent_id={'name': 'Sacramento City Council'})

            api_source = self.BASE_URL + '/bodies/{BodyId}'.format(**body)
            o.add_source(api_source, note='api')
            web_source = (self.WEB_URL
                          + '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'.format(**body))
            o.add_source(web_source, note='web')

            yield o

        yield from members.values()
예제 #14
0
    def scrape(self):
        """Yield the board's committees, then every person attached to them.

        People are first built from the office records of the body named
        'Board of Directors - Regular Board Meeting' (relabelled in place to
        'Board of Directors').  Each body whose type is 'Committee' is then
        turned into an Organization, and its office records are recorded as
        memberships on the matching person.  Committees are yielded as they
        are processed; people are yielded once all committees are done.
        """
        body_types = self.body_types()

        board_of_directors, = [
            candidate for candidate in self.bodies()
            if candidate['BodyName'] == 'Board of Directors - Regular Board Meeting'
        ]
        board_of_directors["BodyName"] = "Board of Directors"

        def span(record):
            # Start/end keyword arguments derived from one office record.
            return {
                'start_date': self.toDate(record['OfficeRecordStartDate']),
                'end_date': self.toDate(record['OfficeRecordEndDate']),
            }

        def attach_sources(person_obj, record):
            # Record the API and web URLs reported for this office record.
            api_url, web_url = self.person_sources_from_office(record)
            person_obj.add_source(api_url, note='api')
            person_obj.add_source(web_url, note='web')

        # Group the board's office records by the person's full name.
        records_by_name = collections.defaultdict(list)
        for record in self.body_offices(board_of_directors):
            records_by_name[record['OfficeRecordFullName']].append(record)

        members = {}
        for member, offices in records_by_name.items():
            p = Person(member)
            for term in offices:
                title = term['OfficeRecordTitle']

                if title not in {'Board Member', 'non-voting member'}:
                    # Any other title is stored verbatim as an appointed term.
                    p.add_term(title, 'legislature',
                               **span(term), appointment=True)

                if title == 'Chief Executive Officer':
                    # The CEO does not additionally get a board-member term.
                    continue

                if title == 'non-voting member':
                    member_type = 'Nonvoting Board Member'
                    post = NONVOTING_POSTS.get(member)
                else:
                    member_type = 'Board Member'
                    post = VOTING_POSTS.get(member)

                p.add_term(member_type, 'legislature',
                           district=post, **span(term))

            # Sources are taken from the last office record of this person
            # (each group holds at least one record).
            attach_sources(p, offices[-1])

            members[member] = p

        for body in self.bodies():
            if body['BodyTypeId'] != body_types['Committee']:
                continue

            org_name = body['BodyName'].strip()
            o = Organization(org_name,
                             classification='committee',
                             parent_id={'name': 'Board of Directors'})

            api_source = self.BASE_URL + '/bodies/{BodyId}'.format(**body)
            o.add_source(api_source, note='api')
            web_source = (
                self.WEB_URL +
                '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'.format(**body)
            )
            o.add_source(web_source, note='web')

            for office in self.body_offices(body):
                title = office['OfficeRecordTitle']
                if title in ("Chair", "Vice Chair"):
                    role = title
                elif title == 'non-voting member':
                    role = 'Nonvoting Member'
                else:
                    role = 'Member'

                person = office['OfficeRecordFullName']

                p = members.get(person)
                if p is None:
                    # First sighting of someone who is not on the board.
                    p = Person(person)
                    attach_sources(p, office)
                    members[person] = p

                p.add_membership(org_name, role=role, **span(office))

            yield o

        yield from members.values()
예제 #15
0
    def scrape(self):
        """Yield Chicago committees, joint committees and council people.

        Office records of the body named 'City Council' are grouped per
        person (records whose name contains 'VACAN' are ignored) and enriched
        with details read from the Legistar web member list.  Bodies of type
        'Committee' are yielded with their memberships recorded on the
        people, bodies of type 'Joint Committee' are yielded without
        memberships, and finally every person is yielded.
        """
        body_types = self.body_types()

        city_council, = [
            candidate for candidate in self.bodies()
            if candidate['BodyName'] == 'City Council'
        ]

        def span(record):
            # Start/end keyword arguments derived from one office record.
            return {
                'start_date': self.toDate(record['OfficeRecordStartDate']),
                'end_date': self.toDate(record['OfficeRecordEndDate']),
            }

        def attach_sources(person_obj, record):
            # Record the API and web URLs reported for this office record.
            api_url, web_url = self.person_sources_from_office(record)
            person_obj.add_source(api_url, note='api')
            person_obj.add_source(web_url, note='web')

        def committee_for(body):
            # Build the Organization for a body, with its api and web sources.
            org = Organization(body['BodyName'],
                               classification='committee',
                               parent_id={'name': 'Chicago City Council'})
            org.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body),
                           note='api')
            org.add_source(
                self.WEB_URL +
                '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'.format(**body),
                note='web')
            return org

        terms = collections.defaultdict(list)
        for record in self.body_offices(city_council):
            full_name = record['OfficeRecordFullName']
            if 'VACAN' in full_name:
                continue
            terms[full_name.strip()].append(record)

        web_scraper = LegistarPersonScraper(None, None)
        web_scraper.MEMBERLIST = 'https://chicago.legistar.com/DepartmentDetail.aspx?ID=12357&GUID=4B24D5A9-FED0-4015-9154-6BFFFB2A8CB4&R=8bcbe788-98cd-4040-9086-b34fa8e49881'
        web_scraper.ALL_MEMBERS = '3:3'

        search = {'ctl00$ContentPlaceHolder$lstName': 'City Council'}
        web_info = {
            listing['Person Name']['label']: listing
            for listing, _ in web_scraper.councilMembers(search)
        }

        # These two names get a stand-in web record that only knows the ward;
        # every other field reads as None.
        for stand_in_name, ward in (('Balcer, James', 11), ('Fioretti, Bob', 2)):
            stand_in = collections.defaultdict(lambda: None)
            stand_in['Ward/Office'] = ward
            web_info[stand_in_name] = stand_in

        # (web field, contact type); the field name doubles as the note.
        contact_fields = (
            ("City Hall Address", "address"),
            ("City Hall Phone", "voice"),
            ("Ward Office Phone", "voice"),
            ("Ward Office Address", "address"),
            ("Fax", "fax"),
        )

        members = {}
        for member, offices in terms.items():
            web = web_info[member]
            p = Person(member)
            for term in offices:
                # Looked up but not used: every term is stored as 'Alderman'.
                _title = term['OfficeRecordTitle']
                ward_label = "Ward {}".format(int(web['Ward/Office']))
                p.add_term('Alderman', 'legislature',
                           district=ward_label, **span(term))

            photo = web.get('Photo')
            if photo:
                p.image = photo

            for field, kind in contact_fields:
                value = web[field]
                if value and value != 'N/A':
                    p.add_contact_detail(type=kind, value=value, note=field)

            email = web["E-mail"]
            if email:
                address = email["label"]
                if address and address != 'N/A':
                    p.add_contact_detail(type="email",
                                         value=address,
                                         note='E-mail')

            website = web['Website']
            if website:
                p.add_link(website['url'])

            # Sources are taken from the last office record of this person
            # (each group holds at least one record).
            attach_sources(p, offices[-1])

            members[member] = p

        for body in self.bodies():
            if body['BodyTypeId'] != body_types['Committee']:
                continue

            o = committee_for(body)

            for office in self.body_offices(body):
                # messed up record for joanna thompson
                if office['OfficeRecordId'] == 1055:
                    continue

                title = office['OfficeRecordTitle']
                role = title if title in ("Vice Chair", "Chairman") else 'Member'

                person = office['OfficeRecordFullName'].strip()
                p = members.get(person)
                if p is None:
                    # Committee member without a council office record.
                    p = Person(person)
                    attach_sources(p, office)
                    members[person] = p

                p.add_membership(body['BodyName'], role=role, **span(office))

            yield o

        for body in self.bodies():
            if body['BodyTypeId'] == body_types['Joint Committee']:
                yield committee_for(body)

        yield from members.values()
    def scrape_chamber(self, chamber):
        """Scrape members and committees of one chamber from scstatehouse.gov.

        The roster page for the chamber ('lower' selects the House page, any
        other value the Senate page) is fetched, then each linked member page
        is parsed for party, district, photo, office contact details and
        committee links.  Each committee is yielded as an Organization the
        first time it is seen during this call, and each member is yielded
        as a Person after the memberships are attached.  Members whose page
        contains 'Resigned effective' are skipped.
        """
        url = ('http://www.scstatehouse.gov/member.php?chamber=H'
               if chamber == 'lower'
               else 'http://www.scstatehouse.gov/member.php?chamber=S')

        seen_committees = {}

        # Abbreviation shown after the committee name -> stored role.
        known_roles = {'Treas.': 'treasurer',
                       'Secy.': 'secretary',
                       'Secy./Treas.': 'secretary/treasurer',
                       'V.C.': 'vice-chair',
                       '1st V.C.': 'first vice-chair',
                       'Co 1st V.C.': 'co-first vice-chair',
                       '2nd V.C.': 'second vice-chair',
                       '3rd V.C.': 'third vice-chair',
                       'Ex.Officio Member': 'ex-officio member',
                       'Chairman': 'chairman'}

        # Paragraphs inside an office block, matched by their inline style.
        address_xpath = ('p[@style="font-size: 13px;'
                         ' margin: 0 0 10px 0; padding: 0;"]')
        phone_xpath = ('p[@style="font-size: 13px;'
                       ' margin: 0 0 0 0; padding: 0;"]/text()')

        # (block xpath, contact note, warning template when block is missing)
        office_blocks = (
            ('//div[@style="float: left; width: 225px;'
             ' margin: 10px 5px 0 20px; padding: 0;"]',
             'Capitol Office',
             'no capitol address for {0}'),
            ('//div[@style="float: left;'
             ' width: 225px; margin: 10px 0 0 20px;"]',
             'District Office',
             'no district address for {0}'),
        )

        title_prefixes = (('Senator', 'Senator '),
                          ('Representative', 'Representative '))

        roster = lxml.html.fromstring(self.get(url).text)
        roster.make_links_absolute(url)

        for anchor in roster.xpath('//a[@class="membername"]'):
            full_name = anchor.text
            leg_url = anchor.get('href')

            for prefix, prefix_with_space in title_prefixes:
                if full_name.startswith(prefix):
                    full_name = full_name.replace(prefix_with_space, '')

            leg_html = self.get(leg_url).text
            leg_doc = lxml.html.fromstring(leg_html)
            leg_doc.make_links_absolute(leg_url)

            if 'Resigned effective' in leg_html:
                self.info('Resigned')
                continue

            # The page is expected to hold exactly three such text nodes.
            party, district, _ = leg_doc.xpath('//p[@style="font-size: 17px;'
                                               ' margin: 0 0 0 0; padding: 0;"]/text()')

            if 'Republican' in party:
                party = 'Republican'
            elif 'Democrat' in party:
                party = 'Democratic'

            # District # - County - Map
            district = district.split()[1]

            photos = leg_doc.xpath('//img[contains(@src,"/members/")]/@src')
            if photos:
                photo_url = photos[0]
            else:
                self.warning("No Photo URL for {}".format(full_name))
                photo_url = ''

            person = Person(name=full_name, district=district,
                            party=party, primary_org=chamber,
                            image=photo_url)

            # Capitol office first, then district (home) office.
            for block_xpath, note, missing_template in office_blocks:
                try:
                    block = leg_doc.xpath(block_xpath)[0]
                    address = block.xpath(address_xpath)[0].text_content()
                    phone = block.xpath(phone_xpath)[0].strip()

                    if address:
                        person.add_contact_detail(type='address', value=address,
                                                  note=note)
                    if phone:
                        person.add_contact_detail(type='voice', value=phone,
                                                  note=note)
                except IndexError:
                    self.warning(missing_template.format(full_name))

            person.add_link(leg_url)
            person.add_source(url)
            person.add_source(leg_url)

            # committees (skip first link)
            committee_links = leg_doc.xpath('//a[contains(@href, "committee.php")]')
            for link in committee_links[1:]:
                if link.text.endswith(', '):
                    committee, abbreviation = link.text_content().rsplit(', ', 1)
                    # Unknown abbreviations raise KeyError on purpose.
                    role = known_roles[abbreviation]
                else:
                    committee = link.text
                    role = 'member'

                # only yield each committee once
                org = seen_committees.get(committee)
                if org is None:
                    org = Organization(name=committee, classification='committee',
                                       chamber=chamber)
                    org.add_source(url)
                    seen_committees[committee] = org
                    yield org

                person.add_membership(org, role=role)

            yield person
예제 #17
0
    def scrape(self):
        '''
        Scrape the web to create a dict with all active organizations.
        Then, we can access the correct URL for the organization detail page.

        Yields each body of type 'Committee' as an Organization (child of
        'Board of Directors') as it is processed, followed by every Person
        collected from the board's and the committees' office records.
        '''
        web_scraper = LegistarPersonScraper(requests_per_minute=self.requests_per_minute)
        web_scraper.MEMBERLIST = 'https://metro.legistar.com/People.aspx'
        web_info = {}

        # Map each department name shown on the web to its 'Department Name'
        # cell; the cell's 'url' entry is used as the committee's web source.
        for _, organizations in web_scraper.councilMembers():
            for organization, _, _ in organizations:
                organization_name = organization['Department Name']['label'].strip()
                organization_info = organization['Department Name']

                web_info[organization_name] = organization_info

        body_types = self.body_types()

        board_of_directors, = [body for body in self.bodies()
                               if body['BodyName'] == 'Board of Directors - Regular Board Meeting']
        board_of_directors["BodyName"] = "Board of Directors"

        # Group the board's office records by the person's full name.
        terms = collections.defaultdict(list)
        for office in self.body_offices(board_of_directors):
            terms[office['OfficeRecordFullName']].append(office)

        members = {}
        for member, offices in terms.items():
            p = Person(member)
            for term in offices:
                role = term['OfficeRecordTitle']

                if role not in {'Board Member', 'non-voting member'}:
                    # Any other title is stored verbatim as an appointed term.
                    p.add_term(role,
                               'legislature',
                               start_date=self.toDate(term['OfficeRecordStartDate']),
                               end_date=self.toDate(term['OfficeRecordEndDate']),
                               appointment=True)
                if role != 'Chief Executive Officer':
                    if role == 'non-voting member':
                        member_type = 'Nonvoting Board Member'
                        post = NONVOTING_POSTS.get(member)
                    else:
                        member_type = 'Board Member'
                        post = VOTING_POSTS.get(member)

                    start_date = self.toDate(term['OfficeRecordStartDate'])
                    end_date = self.toDate(term['OfficeRecordEndDate'])
                    board_membership = p.add_term(member_type,
                                                  'legislature',
                                                  district=post,
                                                  start_date=start_date,
                                                  end_date=end_date)

                    # NOTE(review): presumably maps a person's name to the date
                    # their acting appointment ends -- confirm where
                    # ACTING_MEMBERS_WITH_END_DATE is defined.
                    acting_member_end_date = ACTING_MEMBERS_WITH_END_DATE.get(p.name)
                    if acting_member_end_date and acting_member_end_date <= end_date:
                        board_membership.extras = {'acting': 'true'}

            # Sources are taken from the last office record of this person.
            source_urls = self.person_sources_from_office(term)
            person_api_url, person_web_url = source_urls
            p.add_source(person_api_url, note='api')
            p.add_source(person_web_url, note='web')

            members[member] = p

        for body in self.bodies():
            if body['BodyTypeId'] == body_types['Committee']:
                organization_name = body['BodyName'].strip()
                o = Organization(organization_name,
                                 classification='committee',
                                 parent_id={'name': 'Board of Directors'})

                organization_info = web_info.get(organization_name, {})
                # Fall back to the departments listing when the committee has
                # no detail page on the web.  The fallback is already an
                # absolute URL, so it must not be prefixed with self.WEB_URL
                # (doing so produced a malformed source URL).
                organization_url = organization_info.get(
                    'url', 'https://metro.legistar.com/Departments.aspx')

                o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body), note='api')
                o.add_source(organization_url, note='web')

                for office in self.body_offices(body):
                    role = office['OfficeRecordTitle']

                    if role not in ("Chair", "Vice Chair", "Chief Executive Officer"):
                        if role == 'non-voting member':
                            role = 'Nonvoting Member'
                        else:
                            role = 'Member'

                    person = office['OfficeRecordFullName']

                    if person in members:
                        p = members[person]
                    else:
                        # Committee member without a board office record.
                        p = Person(person)

                        source_urls = self.person_sources_from_office(office)
                        person_api_url, person_web_url = source_urls
                        p.add_source(person_api_url, note='api')
                        p.add_source(person_web_url, note='web')

                        members[person] = p

                    start_date = self.toDate(office['OfficeRecordStartDate'])
                    end_date = self.toDate(office['OfficeRecordEndDate'])
                    membership = p.add_membership(organization_name,
                                                  role=role,
                                                  start_date=start_date,
                                                  end_date=end_date)

                    acting_member_end_date = ACTING_MEMBERS_WITH_END_DATE.get(p.name)
                    if acting_member_end_date and acting_member_end_date <= end_date:
                        membership.extras = {'acting': 'true'}

                yield o

        for p in members.values():
            yield p
예제 #18
0
    def scrape(self):
        """Yield Chicago committees, joint committees and council people.

        Office records of the body named 'City Council' are grouped per
        person (names containing 'vacan', case-insensitively, are ignored)
        and enriched with details read from the Legistar web member list.
        Bodies of type 'Committee' are yielded with memberships recorded on
        the people, bodies of type 'Joint Committee' are yielded without
        memberships, and finally every person is yielded.
        """
        body_types = self.body_types()

        city_council, = [candidate for candidate in self.bodies()
                         if candidate['BodyName'] == 'City Council']

        def attach_sources(person_obj, record):
            # Record the API and web URLs reported for this office record.
            api_url, web_url = self.person_sources_from_office(record)
            person_obj.add_source(api_url, note='api')
            person_obj.add_source(web_url, note='web')

        def committee_for(body):
            # Build the Organization for a body, with its api and web sources.
            org = Organization(body['BodyName'],
                               classification='committee',
                               parent_id={'name': 'Chicago City Council'})
            org.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body),
                           note='api')
            org.add_source(
                self.WEB_URL +
                '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'.format(**body),
                note='web')
            return org

        terms = collections.defaultdict(list)
        for record in self.body_offices(city_council):
            if 'vacan' in record['OfficeRecordFullName'].lower():
                continue
            terms[record['OfficeRecordFullName'].strip()].append(record)

        web_scraper = LegistarPersonScraper(requests_per_minute=self.requests_per_minute)
        web_scraper.MEMBERLIST = 'https://chicago.legistar.com/DepartmentDetail.aspx?ID=12357&GUID=4B24D5A9-FED0-4015-9154-6BFFFB2A8CB4&R=8bcbe788-98cd-4040-9086-b34fa8e49881'
        web_scraper.ALL_MEMBERS = '3:3'

        # Share this scraper's cache settings with the web scraper.
        if self.cache_storage:
            web_scraper.cache_storage = self.cache_storage

        if self.requests_per_minute == 0:
            web_scraper.cache_write_only = False

        search = {'ctl00$ContentPlaceHolder$lstName': 'City Council'}
        web_info = {
            listing['Person Name']['label']: listing
            for listing, _ in web_scraper.councilMembers(search)
        }

        # These two names get a stand-in web record that only knows the ward;
        # every other field reads as None.
        for stand_in_name, ward in (('Balcer, James', 11), ('Fioretti, Bob', 2)):
            stand_in = collections.defaultdict(lambda: None)
            stand_in['Ward/Office'] = ward
            web_info[stand_in_name] = stand_in

        # (web field, contact type); the field name doubles as the note.
        contact_fields = (
            ("City Hall Address", "address"),
            ("City Hall Phone", "voice"),
            ("Ward Office Phone", "voice"),
            ("Ward Office Address", "address"),
            ("Fax", "fax"),
        )

        members = {}
        for member, offices in terms.items():
            web = web_info[member]
            p = Person(member)
            for term in offices:
                # Looked up but not used: every term is stored as 'Alderman'.
                _title = term['OfficeRecordTitle']
                ward_label = "Ward {}".format(int(web['Ward/Office']))
                term_start = self.toDate(term['OfficeRecordStartDate'])
                term_end = self.toDate(term['OfficeRecordEndDate'])
                p.add_term('Alderman',
                           'legislature',
                           district=ward_label,
                           start_date=term_start,
                           end_date=term_end)

            photo = web.get('Photo')
            if photo:
                p.image = photo

            for field, kind in contact_fields:
                value = web[field]
                if value and value != 'N/A':
                    p.add_contact_detail(type=kind, value=value, note=field)

            email = web["E-mail"]
            if email:
                address = email["label"]
                if address and address != 'N/A':
                    p.add_contact_detail(type="email",
                                         value=address,
                                         note='E-mail')

            website = web['Website']
            if website:
                p.add_link(website['url'])

            # Sources are taken from the last office record of this person
            # (each group holds at least one record).
            attach_sources(p, offices[-1])

            members[member] = p

        for body in self.bodies():
            if body['BodyTypeId'] != body_types['Committee']:
                continue

            o = committee_for(body)

            for office in self.body_offices(body):
                # messed up record for joanna thompson
                if office['OfficeRecordId'] == 1055:
                    continue

                title = office['OfficeRecordTitle']
                role = title if title in ("Vice Chair", "Chairman") else 'Member'

                person = office['OfficeRecordFullName'].strip()
                p = members.get(person)
                if p is None:
                    # Committee member without a council office record.
                    p = Person(person)
                    attach_sources(p, office)
                    members[person] = p

                # A TypeError from toDate on the end date is mapped to an
                # empty end date.
                try:
                    end_date = self.toDate(office['OfficeRecordEndDate'])
                except TypeError:
                    end_date = ''

                start_date = self.toDate(office['OfficeRecordStartDate'])
                p.add_membership(body['BodyName'],
                                 role=role,
                                 start_date=start_date,
                                 end_date=end_date)

            yield o

        for body in self.bodies():
            if body['BodyTypeId'] == body_types['Joint Committee']:
                yield committee_for(body)

        yield from members.values()
예제 #19
0
    def scrape(self):
        """Yield Sacramento committees, commissions and council people.

        Office records of the body named 'City Council ' (trailing space
        included) are grouped per person, skipping the 'Granicus BA' record.
        Bodies of type 'Standing Committees' are yielded as committees with
        memberships recorded on the people, bodies of type
        'Boards or Commission' are yielded as commissions without
        memberships, and finally every person is yielded.
        """
        body_types = self.body_types()

        city_council, = [
            candidate for candidate in self.bodies()
            if candidate['BodyName'] == 'City Council '
        ]

        def span(record):
            # Start/end keyword arguments derived from one office record.
            return {
                'start_date': self.toDate(record['OfficeRecordStartDate']),
                'end_date': self.toDate(record['OfficeRecordEndDate']),
            }

        def attach_sources(person_obj, record):
            # Record the API and web URLs reported for this office record.
            api_url, web_url = self.person_sources_from_office(record)
            person_obj.add_source(api_url, note='api')
            person_obj.add_source(web_url, note='web')

        def organization_for(body, classification):
            # Build the Organization for a body, with its api and web sources.
            org = Organization(body['BodyName'],
                               classification=classification,
                               parent_id={'name': 'Sacramento City Council'})
            org.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body),
                           note='api')
            org.add_source(
                self.WEB_URL +
                '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'.format(**body),
                note='web')
            return org

        terms = collections.defaultdict(list)
        for record in self.body_offices(city_council):
            if record['OfficeRecordFullName'] == "Granicus BA":
                continue
            terms[record['OfficeRecordFullName']].append(record)

        members = {}
        for member, offices in terms.items():
            p = Person(member)
            for term in offices:
                # The office title is used as the term's role; no district
                # is set.
                title = term['OfficeRecordTitle']
                p.add_term(title, 'legislature', **span(term))

            # Sources are taken from the last office record of this person
            # (each group holds at least one record).
            attach_sources(p, offices[-1])

            members[member] = p

        for body in self.bodies():
            if body['BodyTypeId'] != body_types['Standing Committees']:
                continue

            o = organization_for(body, 'committee')

            for office in self.body_offices(body):
                # messed up record for joanna thompson
                if office['OfficeRecordId'] == 1055:
                    continue

                title = office['OfficeRecordTitle']
                role = title if title in ("Vice Chair", "Chairperson") else 'Member'

                person = office['OfficeRecordFullName'].strip()
                p = members.get(person)
                if p is None:
                    # Committee member without a council office record.
                    p = Person(person)
                    attach_sources(p, office)
                    members[person] = p

                p.add_membership(body['BodyName'], role=role, **span(office))

            yield o

        for body in self.bodies():
            if body['BodyTypeId'] == body_types['Boards or Commission']:
                yield organization_for(body, 'commission')

        yield from members.values()
예제 #20
0
    def scrape(self):
        """Yield board members, board committees and non-board committee members.

        Office records of the body named 'Board of Directors' are grouped per
        person; each person is yielded with one term per office record.
        Every body of type 'Committee' is then yielded as an Organization.
        Committee office records of board members are attached to the
        organization via ``add_member``; records of anyone else create (or
        extend) a separate Person, and those people are yielded last.
        """
        body_types = self.body_types()

        board_of_directors, = [
            body for body in self.bodies()
            if body['BodyName'] == 'Board of Directors'
        ]

        # Full name -> list of that person's board office records.
        members = {}
        for office in self.body_offices(board_of_directors):
            members.setdefault(office['OfficeRecordFullName'],
                               []).append(office)

        for member, offices in members.items():
            p = Person(member)
            for term in offices:
                role = term['OfficeRecordTitle']

                if role != 'non-voting member':
                    role = 'Board Member'
                    post = VOTING_POSTS.get(member)
                else:
                    role = 'Nonvoting Board Member'
                    post = NONVOTING_POSTS.get(member)

                # Dates must come from the record being processed (`term`).
                # Reading them from `office` would reuse whatever record the
                # grouping loop above happened to end on.
                p.add_term(role,
                           'legislature',
                           district=post,
                           start_date=self.toDate(
                               term['OfficeRecordStartDate']),
                           end_date=self.toDate(term['OfficeRecordEndDate']))

            legistar_api = self.BASE_URL + '/OfficeRecords/'

            p.add_source(legistar_api, note='api')

            yield p

        # People who sit on a committee but hold no board office record.
        adjunct_members = {}

        for body in self.bodies():
            if body['BodyTypeId'] == body_types['Committee']:
                o = Organization(body['BodyName'],
                                 classification='committee',
                                 parent_id={'name': 'Board of Directors'})

                o.add_source(self.BASE_URL + '/Bodies/')

                for office in self.body_offices(body):
                    role = office['OfficeRecordTitle']
                    if role not in ("Chair", "Vice Chair"):
                        role = 'Member'

                    person = office['OfficeRecordFullName']
                    if person not in members:
                        if person not in adjunct_members:
                            p = Person(person)
                            # NOTE(review): 'foo' is a placeholder source, not
                            # a real URL -- replace once the proper source for
                            # these people is known.
                            p.add_source('foo')

                        else:
                            p = adjunct_members[person]

                        p.add_membership(body['BodyName'],
                                         role=role,
                                         start_date=self.toDate(
                                             office['OfficeRecordStartDate']),
                                         end_date=self.toDate(
                                             office['OfficeRecordEndDate']))
                        adjunct_members[person] = p
                    else:
                        # Board members were already yielded above, so their
                        # committee seat is recorded on the organization.
                        o.add_member(office['OfficeRecordFullName'],
                                     role,
                                     start_date=self.toDate(
                                         office['OfficeRecordStartDate']),
                                     end_date=self.toDate(
                                         office['OfficeRecordEndDate']))

                yield o

        for p in adjunct_members.values():
            yield p
예제 #21
0
    def scrape(self):
        """Yield legislators and their committees from the legislator CSV.

        Downloads the CSV at ``leg_url``, checks that its header row still
        equals ``HEADERS``, then builds one ``Person`` per row with party,
        link, contact details and committee memberships. Each committee
        ``Organization`` is yielded once, the first time its name is seen,
        ahead of the legislator whose row introduced it.

        Raises:
            AssertionError: if the header row differs from ``HEADERS`` or a
                district value is not numeric.
            KeyError: if a row's "office code" is neither "H" nor "S".
            ValueError: if a non-empty email value has no "@" and is not an
                http(s) URL, or if a committee entry contains more than one
                " (" separator.
        """
        # chambers = [chamber] if chamber is not None else ['upper', 'lower']
        leg_url = "ftp://ftp.cga.ct.gov/pub/data/LegislatorDatabase.csv"
        page = self.get(leg_url)

        # Committees already created, keyed by name, so each is yielded once.
        committees = {}

        # Ensure that the spreadsheet's structure hasn't generally changed
        _row_headers = page.text.split("\r\n")[0].replace('"', "").split(",")
        assert _row_headers == HEADERS, "Spreadsheet structure may have changed"

        page = open_csv(page)
        for row in page:

            chamber = {"H": "lower", "S": "upper"}[row["office code"]]

            district = row["dist"].lstrip("0")
            assert district.isdigit(), "Invalid district found: {}".format(district)

            # Assemble "first [middle] last [suffix]".
            name = row["first name"]
            mid = row["middle initial"].strip()
            if mid:
                name += " %s" % mid
            name += " %s" % row["last name"]
            suffix = row["suffix"].strip()
            if suffix:
                name += " %s" % suffix

            party = row["party"]
            if party == "Democrat":
                party = "Democratic"

            leg = Person(primary_org=chamber, name=name, district=district, party=party)

            legislator_url = row["URL"].replace("\\", "//").strip()
            if legislator_url != "":
                if not legislator_url.startswith("http"):
                    # Prepend the scheme; assigning the bare "http://" string
                    # would discard the address and store a useless link.
                    legislator_url = "http://" + legislator_url
                leg.add_link(legislator_url)

            leg.add_party(party=party)

            office_address = "%s\nRoom %s\nHartford, CT 06106" % (
                row["capitol street address"],
                row["room number"],
            )
            # extra_office_fields = dict()
            # The email column may be empty or hold a contact-form URL; both
            # are dropped. Any other value without "@" is treated as an error.
            email = row["email"].strip()
            if "@" not in email:
                if not email:
                    email = None
                elif email.startswith("http://") or email.startswith("https://"):
                    # extra_office_fields['contact_form'] = email
                    email = None
                else:
                    raise ValueError("Problematic email found: {}".format(email))
            leg.add_contact_detail(
                type="address", value=office_address, note="Capitol Office"
            )
            leg.add_contact_detail(
                type="voice", value=row["capitol phone"], note="Capitol Office"
            )
            if email:
                leg.add_contact_detail(type="email", value=email)

            home_address = "{}\n{}, {} {}".format(
                row["home street address"],
                row["home city"],
                row["home state"],
                row["home zip code"],
            )
            # Rows whose "home" address is the Legislative Office Building
            # get no district office entry.
            if "Legislative Office Building" not in home_address:
                leg.add_contact_detail(
                    type="address", value=home_address, note="District Office"
                )
                if row["home phone"].strip():
                    leg.add_contact_detail(
                        type="voice", value=row["home phone"], note="District Office"
                    )
            leg.add_source(leg_url)

            # Entries are ";"-separated, optionally as "Name (Role)".
            for comm_name in row["committee member1"].split(";"):
                if " (" in comm_name:
                    comm_name, role = comm_name.split(" (")
                    role = role.strip(")").lower()
                else:
                    role = "member"
                comm_name = comm_name.strip()
                if comm_name:
                    if comm_name in committees:
                        com = committees[comm_name]
                    else:
                        com = Organization(
                            comm_name, classification="committee", chamber=chamber
                        )
                        com.add_source(leg_url)
                        committees[comm_name] = com
                        yield com

                    leg.add_membership(name_or_org=com, role=role)

            yield leg
예제 #22
0
    def scrape(self):
        """Yield committee organizations, then every person encountered.

        Member details obtained through ``LegistarPersonScraper`` are merged
        with the office records of the body named 'City Council' to build
        one ``Person`` per council member (one term per office record).
        Committee-like bodies are then turned into ``Organization`` objects
        and each of their office holders receives a membership; holders who
        are not council members get a new ``Person``. Organizations are
        yielded as they are built, and all people are yielded at the end.

        Raises:
            ValueError: if there is not exactly one body named
                'City Council' (single-element unpacking).
            AssertionError: if the set of names from the member list and
                the set of names from the office records differ.
        """
        web_scraper = LegistarPersonScraper(requests_per_minute=self.requests_per_minute)
        web_scraper.MEMBERLIST = 'http://legistar.council.nyc.gov/DepartmentDetail.aspx?ID=6897&GUID=CDC6E691-8A8C-4F25-97CB-86F31EDAB081&Mode=MainBody'

        if self.cache_storage:
            web_scraper.cache_storage = self.cache_storage

        if self.requests_per_minute == 0:
            web_scraper.cache_write_only = False

        # Member-list details keyed by stripped display name.
        web_info = {}

        for member, _ in web_scraper.councilMembers():
            name = member['Person Name']['label'].strip()
            web_info[name] = member

        city_council, = [body for body in self.bodies()
                         if body['BodyName'] == 'City Council']

        terms = collections.defaultdict(list)

        public_advocates = {  # Match casing to Bill De Blasio as council member
            'The Public Advocate (Mr. de Blasio)': 'Bill De Blasio',
            'The Public Advocate (Ms. James)': 'Letitia James',
        }

        for office in self.body_offices(city_council):
            name = office['OfficeRecordFullName']
            name = public_advocates.get(name, name).strip()

            terms[name].append(office)

            # Add past members (and public advocates) that the member list
            # does not include; every lookup on the stand-in yields None.
            if name not in web_info:
                web_info[name] = collections.defaultdict(lambda: None)

        # Check that we have everyone we expect, formatted consistently, in
        # both information arrays. For instance, this will fail if we forget to
        # strip trailing spaces from names on one side or the other (which has
        # the effect of omitting information, such as post, from the scrape).

        assert set(web_info.keys()) == set(terms.keys())

        # Member-list field -> (contact detail type, note). Loop-invariant,
        # so it is built once rather than once per term.
        contact_types = {
            "City Hall Office": ("address", "City Hall Office"),
            "City Hall Phone": ("voice", "City Hall Phone"),
            "Ward Office Phone": ("voice", "Ward Office Phone"),
            "Ward Office Address": ("address", "Ward Office Address"),
            "Fax": ("fax", "Fax")
        }

        members = {}

        for member, offices in terms.items():

            p = Person(member)

            web = web_info[member]

            for term in offices:
                role = term['OfficeRecordTitle']

                if role == 'Public Advocate':
                    role = 'Non-Voting Council Member'
                else:
                    role = 'Council Member'

                district = web.get('District', '').replace(' 0', ' ')

                p.add_term(role,
                           'legislature',
                           district=district,
                           start_date=self.toDate(term['OfficeRecordStartDate']),
                           end_date=self.toDate(term['OfficeRecordEndDate']))

                party = web.get('Political Party')

                if party == 'Democrat':
                    party = 'Democratic'

                if party:
                    p.add_party(party)

                if web.get('Photo'):
                    p.image = web['Photo']

                for contact_type, (type_, _note) in contact_types.items():
                    # ``web`` is a mapping and must be subscripted; calling
                    # it raised TypeError whenever the field was present.
                    if web.get(contact_type) and web[contact_type] != 'N/A':
                        p.add_contact_detail(type=type_,
                                             value=web[contact_type],
                                             note=_note)

                if web.get('E-mail'):
                    p.add_contact_detail(type="email",
                                         value=web['E-mail']['url'],
                                         note='E-mail')

                if web.get('Web site'):
                    p.add_link(web['Web site']['url'], note='web site')

                if web.get('Notes'):
                    p.extras = {'Notes': web['Notes']}

                if not p.sources:  # Only add sources once
                    source_urls = self.person_sources_from_office(term)
                    person_api_url, person_web_url = source_urls
                    p.add_source(person_api_url, note='api')
                    p.add_source(person_web_url, note='web')

            members[member] = p

        committee_types = ['Committee',
                           'Inactive Committee',
                           'Select Committee',
                           'Subcommittee',
                           'Task Force',
                           'Land Use',  # Committee on Land Use
                           ]

        body_types = {k: v for k, v in self.body_types().items()
                      if k in committee_types}

        for body in self.bodies():
            if body['BodyTypeName'] in body_types \
                or body['BodyName'] in ('Legislative Documents Unit',
                                        'Legal and Government Affairs Division'):

                # Skip typo in API data
                if body['BodyName'] == 'Committee on Mental Health, Developmental Disability, Alcoholism, Substance Abuse amd Disability Services':
                    continue

                parent_org = PARENT_ORGS.get(body['BodyName'], 'New York City Council')

                body_name = body['BodyName']

                o = Organization(body_name,
                                 classification='committee',
                                 parent_id={'name': parent_org})

                o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body), note='api')
                o.add_source(self.WEB_URL + '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'.format(**body), note='web')

                for office in self.body_offices(body):
                    # Possible roles: 'Council Member', 'MEMBER', 'Ex-Officio',
                    # 'Committee Member', None, 'CHAIRPERSON'

                    role = office['OfficeRecordTitle']

                    if role and role.lower() == 'chairperson':
                        role = 'Chairperson'
                    else:
                        role = 'Member'

                    person = office['OfficeRecordFullName']
                    person = public_advocates.get(person, person).strip()

                    if person in members:
                        p = members[person]
                    else:
                        # Office holder who never held a City Council office:
                        # create them here so the membership has an owner.
                        p = Person(person)

                        source_urls = self.person_sources_from_office(office)
                        person_api_url, person_web_url = source_urls
                        p.add_source(person_api_url, note='api')
                        p.add_source(person_web_url, note='web')

                        members[person] = p

                    p.add_membership(o,
                                     role=role,
                                     start_date=self.toDate(office['OfficeRecordStartDate']),
                                     end_date=self.toDate(office['OfficeRecordEndDate']))

                yield o

        for p in members.values():
            yield p
예제 #23
0
    def scrape(self):
        """Yield the 'Denver City Council' organization, then its members.

        The list page is queried for member images, names, detail-page
        links and post labels (all anchored on links whose href contains
        "councildistrict"). The council ``Organization`` is yielded first
        with one post per label found; then one ``Person`` is yielded per
        (image, name, detail url, post) tuple, with a council membership
        that carries any phone number and email address found in the
        detail page's contact text.

        Raises:
            IndexError: if neither contact container is present on a
                member's detail page.
        """
        urls = Urls(dict(list=legislators_url), self)

        council = Organization('Denver City Council')
        council.add_source(legislators_url)

        # Get image urls, names, detail urls, and districts.
        image_xpath = '//a[contains(@href, "councildistrict")]/img/@src'
        image_urls = urls.list.xpath(image_xpath)

        name_xpath = '//a[contains(@href, "councildistrict")]'
        names = [a.text_content() for a in urls.list.xpath(name_xpath)][:-1]
        names = filter(None, names)

        person_urls_xpath = '//a[contains(@href, "councildistrict")]/@href'
        person_urls = urls.list.xpath(person_urls_xpath)

        post_ids = []
        xpath = '//a[contains(@href, "councildistrict")]/img/ancestor::td'
        for td in urls.list.xpath(xpath):
            text = td.text_content()
            # Raw string: "\d" in a plain literal is an invalid escape.
            m = re.search(r'Council District \d+', text)
            if m:
                post_ids.append(m.group())
                continue
            m = re.search('Council At-Large', text)
            if m:
                post_ids.append('Council At-Large')

        for post_id in post_ids:
            council.add_post(post_id, post_id)
        yield council

        phone_regex = r'\(\d{3}\)[ -]*\d{3}-\d{4}'
        # NOTE(review): the pattern previously here was the literal text
        # "[email protected]", which looks like an email-obfuscation artifact
        # and cannot match a real address. This generic pattern is a
        # reconstruction -- confirm against the original scraper.
        email_regex = r'\S+@\S+'

        data = zip(image_urls, names, person_urls, post_ids)
        for image_url, name, person_url, post_id in data:

            # Create legislator.
            person = Person(name, image=image_url)

            # Add sources.
            urls.add(detail=person_url)
            person.add_source(urls.list.url, note='list')
            person.add_source(urls.detail.url, note='detail')

            # Add membership on council.
            memb = person.add_membership(council, district=post_id.strip())
            memb.add_source(urls.detail.url)

            xpath = '//div[@id="dnn_column3"]'
            contact_text = urls.detail.xpath(xpath)[0].text_content()

            # Fall back to the alternate container when the first is blank.
            if not contact_text.strip():
                xpath = '//div[contains(@id, "dnn_RightPaneWide")]'
                contact_text = urls.detail.xpath(xpath)[0].text_content()

            # Add phone number; skip it rather than crash with
            # AttributeError when the page has none.
            phone_match = re.search(phone_regex, contact_text)
            if phone_match:
                memb.contact_details.append(
                    dict(type='phone', value=phone_match.group(), note='work'))

            # Add email address, with the same guard.
            email_match = re.search(email_regex, contact_text)
            if email_match:
                memb.contact_details.append(
                    dict(type='email', value=email_match.group(), note='work'))

            yield person
예제 #24
0
    def scrape(self):
        """Yield legislators and their committees from the legislator CSV.

        Downloads the CSV at ``leg_url``, checks that its header row still
        equals ``HEADERS``, then builds one ``Person`` per row with party,
        link, contact details and committee memberships. Each committee
        ``Organization`` is yielded once, the first time its name is seen,
        ahead of the legislator whose row introduced it.

        Raises:
            AssertionError: if the header row differs from ``HEADERS`` or a
                district value is not numeric.
            KeyError: if a row's 'office code' is neither 'H' nor 'S'.
            ValueError: if a non-empty email value has no "@" and is not an
                http(s) URL, or if a committee entry contains more than one
                ' (' separator.
        """
        # chambers = [chamber] if chamber is not None else ['upper', 'lower']
        leg_url = "ftp://ftp.cga.ct.gov/pub/data/LegislatorDatabase.csv"
        page = self.get(leg_url)

        # Committees already created, keyed by name, so each is yielded once.
        committees = {}

        # Ensure that the spreadsheet's structure hasn't generally changed
        _row_headers = page.text.split('\r\n')[0].replace('"', '').split(',')
        assert _row_headers == HEADERS, "Spreadsheet structure may have changed"

        page = open_csv(page)
        for row in page:

            chamber = {'H': 'lower', 'S': 'upper'}[row['office code']]

            district = row['dist'].lstrip('0')
            assert district.isdigit(), "Invalid district found: {}".format(district)

            # Assemble "first [middle] last [suffix]".
            name = row['first name']
            mid = row['middle initial'].strip()
            if mid:
                name += " %s" % mid
            name += " %s" % row['last name']
            suffix = row['suffix'].strip()
            if suffix:
                name += " %s" % suffix

            party = row['party']
            if party == 'Democrat':
                party = 'Democratic'

            leg = Person(primary_org=chamber,
                         name=name,
                         district=district,
                         party=party
                         )

            legislator_url = row['URL'].replace('\\', '//').strip()
            if legislator_url != '':
                if not legislator_url.startswith('http'):
                    # Prepend the scheme; assigning the bare 'http://' string
                    # would discard the address and store a useless link.
                    legislator_url = 'http://' + legislator_url
                leg.add_link(legislator_url)

            leg.add_party(party=party)

            office_address = "%s\nRoom %s\nHartford, CT 06106" % (
                row['capitol street address'], row['room number'])
            # extra_office_fields = dict()
            # The email column may be empty or hold a contact-form URL; both
            # are dropped. Any other value without "@" is treated as an error.
            email = row['email'].strip()
            if "@" not in email:
                if not email:
                    email = None
                elif email.startswith('http://') or email.startswith('https://'):
                    # extra_office_fields['contact_form'] = email
                    email = None
                else:
                    raise ValueError("Problematic email found: {}".format(email))
            leg.add_contact_detail(type='address', value=office_address, note='Capitol Office')
            leg.add_contact_detail(type='voice', value=row['capitol phone'], note='Capitol Office')
            if email:
                leg.add_contact_detail(type='email', value=email)

            home_address = "{}\n{}, {} {}".format(
                row['home street address'],
                row['home city'],
                row['home state'],
                row['home zip code'],
            )
            # Rows whose "home" address is the Legislative Office Building
            # get no district office entry.
            if "Legislative Office Building" not in home_address:
                leg.add_contact_detail(type='address', value=home_address, note='District Office')
                if row['home phone'].strip():
                    leg.add_contact_detail(type='voice',
                                           value=row['home phone'],
                                           note='District Office')
            leg.add_source(leg_url)

            # Entries are ";"-separated, optionally as "Name (Role)".
            for comm_name in row['committee member1'].split(';'):
                if ' (' in comm_name:
                    comm_name, role = comm_name.split(' (')
                    role = role.strip(')').lower()
                else:
                    role = 'member'
                comm_name = comm_name.strip()
                if comm_name:
                    if comm_name in committees:
                        com = committees[comm_name]
                    else:
                        com = Organization(comm_name, classification='committee', chamber=chamber)
                        com.add_source(leg_url)
                        committees[comm_name] = com
                        yield com

                    leg.add_membership(name_or_org=com, role=role)

            yield leg
예제 #25
0
    def transform_parse(self, parsed_form, response):

        _source = {
            "url": response.url,
            "note": "LDA Form LD-1"
        }

        # basic disclosure fields
        _disclosure = Disclosure(
            effective_date=datetime.strptime(
                parsed_form['datetimes']['effective_date'],
                '%Y-%m-%d %H:%M:%S').replace(tzinfo=UTC),
            timezone='America/New_York',
            submitted_date=datetime.strptime(
                parsed_form['datetimes']['signature_date'],
                '%Y-%m-%d %H:%M:%S').replace(tzinfo=UTC),
            classification="lobbying"
        )

        _disclosure.add_authority(name=self.authority.name,
                                  type=self.authority._type,
                                  id=self.authority._id)

        _disclosure.add_identifier(
            identifier=parsed_form['_meta']['document_id'],
            scheme="urn:sopr:filing"
        )

        # disclosure extras
        _disclosure.extras = {}
        _disclosure.extras['registrant'] = {
            'self_employed_individual': parsed_form['registrant']['self_employed_individual'],
            'general_description': parsed_form['registrant']['registrant_general_description'],
            'signature': {
                "signature_date": parsed_form['datetimes']['signature_date'],
                "signature": parsed_form['signature']
            }
        }

        _disclosure.extras['client'] = {
            'same_as_registrant':
                parsed_form['client']['client_self'],
            'general_description':
                parsed_form['client']['client_general_description']
        }

        _disclosure.extras['registration_type'] = {
            'is_amendment':
                parsed_form['registration_type']['is_amendment'],
            'new_registrant':
                parsed_form['registration_type']['new_registrant'],
            'new_client_for_existing_registrant':
                parsed_form['registration_type'][
                    'new_client_for_existing_registrant'],
        }

        # # Registrant
        # build registrant
        _registrant_self_employment = None

        if parsed_form['registrant']['self_employed_individual']:
            n = ' '.join([p for p in [
                parsed_form['registrant']['registrant_individual_prefix'],
                parsed_form['registrant']['registrant_individual_firstname'],
                parsed_form['registrant']['registrant_individual_lastname']
            ] if len(p) > 0]).strip()

            _registrant = Person(
                name=n,
                source_identified=True
            )

            _registrant_self_employment = Organization(
                name='SELF-EMPLOYMENT of {n}'.format(n=n),
                classification='company',
                source_identified=True
            )

            _registrant.add_membership(
                organization=_registrant_self_employment,
                role='self_employed',
                label='self-employment of {n}'.format(n=n),
                start_date=_disclosure.effective_date.strftime('%Y-%m-%d')
            )
        else:
            _registrant = Organization(
                name=parsed_form['registrant']['registrant_org_name'],
                classification='company',
                source_identified=True
            )

        if len(parsed_form['registrant']['registrant_house_id']) > 0:
            _registrant.add_identifier(
                identifier=parsed_form['registrant']['registrant_house_id'],
                scheme='urn:house_clerk:registrant'
            )

        if len(parsed_form['registrant']['registrant_senate_id']) > 0:
            _registrant.add_identifier(
                identifier=parsed_form['registrant']['registrant_senate_id'],
                scheme='urn:sopr:registrant'
            )

        registrant_contact_details = [
            {
                "type": "address",
                "note": "contact address",
                "value": '; '.join([
                    p for p in [
                        parsed_form['registrant']['registrant_address_one'],
                        parsed_form['registrant']['registrant_address_two'],
                        parsed_form['registrant']['registrant_city'],
                        parsed_form['registrant']['registrant_state'],
                        parsed_form['registrant']['registrant_zip'],
                        parsed_form['registrant']['registrant_country']]
                    if len(p) > 0]).strip(),
            },
            {
                "type": "voice",
                "note": "contact phone",
                "value": parsed_form['registrant']['registrant_contact_phone'],
            },
            {
                "type": "email",
                "note": "contact email",
                "value": parsed_form['registrant']['registrant_contact_email'],
            },
        ]

        registrant_contact_ppb = {
            "type": "address",
            "note": "principal place of business",
            "value": '; '.join([
                p for p in [
                    parsed_form['registrant']['registrant_ppb_city'],
                    parsed_form['registrant']['registrant_ppb_state'],
                    parsed_form['registrant']['registrant_ppb_zip'],
                    parsed_form['registrant']['registrant_ppb_country']]
                if len(p) > 0]).strip(),
        }

        if registrant_contact_ppb["value"]:
            registrant_contact_details.append(registrant_contact_ppb)

        for cd in registrant_contact_details:
            _registrant.add_contact_detail(**cd)

        _registrant.extras = {
            "contact_details_structured": [
                {
                    "type": "address",
                    "note": "contact address",
                    "parts": [
                        {
                            "note": "address_one",
                            "value": parsed_form['registrant'][
                                'registrant_address_one'],
                        },
                        {
                            "note": "address_two",
                            "value": parsed_form['registrant'][
                                'registrant_address_two'],
                        },
                        {
                            "note": "city",
                            "value": parsed_form['registrant'][
                                'registrant_city'],
                        },
                        {
                            "note": "state",
                            "value": parsed_form['registrant'][
                                'registrant_state'],
                        },
                        {
                            "note": "zip",
                            "value": parsed_form['registrant'][
                                'registrant_zip'],
                        },
                        {
                            "note": "country",
                            "value": parsed_form['registrant'][
                                'registrant_country'],
                        }
                    ],
                },
                {
                    "type": "address",
                    "note": "principal place of business",
                    "parts": [
                        {
                            "note": "city",
                            "value": parsed_form['registrant'][
                                'registrant_ppb_city'],
                        },
                        {
                            "note": "state",
                            "value": parsed_form['registrant'][
                                'registrant_ppb_state'],
                        },
                        {
                            "note": "zip",
                            "value": parsed_form['registrant'][
                                'registrant_ppb_zip'],
                        },
                        {
                            "note": "country",
                            "value": parsed_form['registrant'][
                                'registrant_ppb_country'],
                        }
                    ],
                },
            ]
        }

        # # People
        # build contact
        _main_contact = Person(
            name=parsed_form['registrant']['registrant_contact_name'],
            source_identified=True
        )

        main_contact_contact_details = [
            {
                "type": "voice",
                "note": "contact phone",
                "value": parsed_form['registrant']['registrant_contact_phone'],
            },
            {
                "type": "email",
                "note": "contact email",
                "value": parsed_form['registrant']['registrant_contact_email'],
            }
        ]

        for cd in main_contact_contact_details:
            _main_contact.add_contact_detail(**cd)

        if _registrant._type == 'organization':
            _registrant.add_member(
                name_or_person=_main_contact,
                role='main_contact',
                label='main contact for {n}'.format(n=_registrant.name),
                start_date=_disclosure.effective_date.strftime('%Y-%m-%d')
            )
        else:
            _registrant_self_employment.add_member(
                name_or_person=_main_contact,
                role='main_contact',
                label='main contact for {n}'.format(n=_registrant.name),
                start_date=_disclosure.effective_date.strftime('%Y-%m-%d')
            )

        # # Client
        # build client
        _client = Organization(
            name=parsed_form['client']['client_name'],
            classification='company',
            source_identified=True
        )

        client_contact_details = [
            {
                "type": "address",
                "note": "contact address",
                "value": '; '.join([
                    p for p in [
                        parsed_form['client']['client_address'],
                        parsed_form['client']['client_city'],
                        parsed_form['client']['client_state'],
                        parsed_form['client']['client_zip'],
                        parsed_form['client']['client_country']]
                    if len(p) > 0]).strip(),
            },
        ]

        client_contact_ppb = {
            "type": "address",
            "note": "principal place of business",
            "value": '; '.join([
                p for p in [
                    parsed_form['client']['client_ppb_city'],
                    parsed_form['client']['client_ppb_state'],
                    parsed_form['client']['client_ppb_zip'],
                    parsed_form['client']['client_ppb_country']]
                if len(p) > 0]).strip(),
        }

        if client_contact_ppb["value"]:
            client_contact_details.append(client_contact_ppb)

        for cd in client_contact_details:
            _client.add_contact_detail(**cd)

        _client.extras = {
            "contact_details_structured": [
                {
                    "type": "address",
                    "note": "contact address",
                    "parts": [
                        {
                            "note": "address",
                            "value": parsed_form['client']['client_address'],
                        },
                        {
                            "note": "city",
                            "value": parsed_form['client']['client_city'],
                        },
                        {
                            "note": "state",
                            "value": parsed_form['client']['client_state'],
                        },
                        {
                            "note": "zip",
                            "value": parsed_form['client']['client_zip'],
                        },
                        {
                            "note": "country",
                            "value": parsed_form['client']['client_country'],
                        }
                    ],
                },
                {
                    "type": "address",
                    "note": "principal place of business",
                    "parts": [
                        {
                            "note": "city",
                            "value": parsed_form['client']['client_ppb_city'],
                        },
                        {
                            "note": "state",
                            "value": parsed_form['client']['client_ppb_state'],
                        },
                        {
                            "note": "zip",
                            "value": parsed_form['client']['client_ppb_zip'],
                        },
                        {
                            "note": "country",
                            "value": parsed_form['client'][
                                'client_ppb_country'],
                        }
                    ],
                },
            ],
        }

        # Collect Foreign Entities
        _foreign_entities = []
        _foreign_entities_by_name = {}
        for fe in parsed_form['foreign_entities']:
            fe_extras = {}
            fe_name = fe['foreign_entity_name']

            # check for name-based duplicates
            if fe_name in _foreign_entities_by_name:
                _foreign_entity = _foreign_entities_by_name[fe_name]
            else:
                _foreign_entity = Organization(
                    name=fe_name,
                    classification='company',
                    source_identified=True
                )

            # collect contact details
            foreign_entity_contact_details = [
                {
                    "type": "address",
                    "note": "contact address",
                    "value": '; '.join([
                        p for p in [
                            fe['foreign_entity_address'],
                            fe['foreign_entity_city'],
                            fe['foreign_entity_state'],
                            fe['foreign_entity_country']]
                        if len(p) > 0]).strip(),
                },
                {
                    "type": "address",
                    "note": "principal place of business",
                    "value": '; '.join([
                        p for p in [
                            fe['foreign_entity_ppb_state'],
                            fe['foreign_entity_ppb_country']]
                        if len(p) > 0]).strip(),
                },
            ]

            foreign_entity_contact_ppb = {
                "type": "address",
                "note": "principal place of business",
                "value": '; '.join([
                    p for p in [
                        fe['foreign_entity_ppb_city'],
                        fe['foreign_entity_ppb_state'],
                        fe['foreign_entity_ppb_country']]
                    if len(p) > 0]),
            }

            if foreign_entity_contact_ppb["value"]:
                foreign_entity_contact_details.append(
                    foreign_entity_contact_ppb)

            # add contact details
            for cd in foreign_entity_contact_details:
                if cd['value'] != '':
                    _foreign_entity.add_contact_detail(**cd)

            # add extras
            fe_extras["contact_details_structured"] = [
                {
                    "type": "address",
                    "note": "contact address",
                    "parts": [
                        {
                            "note": "address",
                            "value": fe['foreign_entity_address'],
                        },
                        {
                            "note": "city",
                            "value": fe['foreign_entity_city'],
                        },
                        {
                            "note": "state",
                            "value": fe['foreign_entity_state'],
                        },
                        {
                            "note": "country",
                            "value": fe['foreign_entity_country'],
                        }
                    ],
                },
                {
                    "type": "address",
                    "note": "principal place of business",
                    "parts": [
                        {
                            "note": "state",
                            "value": fe['foreign_entity_ppb_state'],
                        },
                        {
                            "note": "country",
                            "value": fe['foreign_entity_ppb_country'],
                        }
                    ],
                },
            ]

            _foreign_entity.extras = combine_dicts(_foreign_entity.extras,
                                                   fe_extras)

            _foreign_entities_by_name[fe_name] = _foreign_entity

        for unique_foreign_entity in _foreign_entities_by_name.values():
            _foreign_entities.append(unique_foreign_entity)

            # TODO: add a variant on memberships to represent inter-org
            # relationships (associations, ownership, etc)
            #
            # _client['memberships'].append({
            #     "id": _foreign_entity['id'],
            #     "classification": "organization",
            #     "name": _foreign_entity['name'],
            #     "extras": {
            #         "ownership_percentage":
            #             fe['foreign_entity_amount']
            #     }
            # })

        # Collect Lobbyists
        # TODO: deal with wierd non-name line continuation cases (blanks, "continued")
        _lobbyists_by_name = {}

        for l in parsed_form['lobbyists']:
            l_extras = {}
            l_name = ' '.join([l['lobbyist_first_name'],
                               l['lobbyist_last_name'],
                               l['lobbyist_suffix']
                               ]).strip()

            if l_name in _lobbyists_by_name:
                _lobbyist = _lobbyists_by_name[l_name]
            else:
                _lobbyist = Person(
                    name=l_name,
                    source_identified=True
                )

            if l['lobbyist_covered_official_position']:
                l_extras['lda_covered_official_positions'] = [
                    {
                        'date_reported':
                            parsed_form['datetimes']['effective_date'],
                        'covered_official_position':
                            l['lobbyist_covered_official_position']
                    },
                ]

            _lobbyist.extras = combine_dicts(_lobbyist.extras, l_extras)

            _lobbyists_by_name[l_name] = _lobbyist

        _lobbyists = []
        for unique_lobbyist in _lobbyists_by_name.values():
            _lobbyists.append(unique_lobbyist)

        if _registrant._type == 'organization':
            for l in _lobbyists:
                _registrant.add_member(
                    l,
                    role='lobbyist',
                    label='lobbyist for {n}'.format(n=_registrant.name),
                    start_date=_disclosure.effective_date.strftime('%Y-%m-%d')
                )
        else:
            for l in _lobbyists:
                _registrant_self_employment.add_member(
                    l,
                    role='lobbyist',
                    label='lobbyist for {n}'.format(n=_registrant.name),
                    start_date=_disclosure.effective_date.strftime('%Y-%m-%d')
                )

        # # Document
        # build document
        _disclosure.add_document(
            note='submitted filing',
            date=parsed_form['datetimes']['effective_date'][:10],
            url=response.url
        )

        # Collect Affiliated orgs
        _affiliated_organizations = []
        _affiliated_organizations_by_name = {}
        for ao in parsed_form['affiliated_organizations']:
            ao_extras = {}
            ao_name = ao['affiliated_organization_name']
            if ao_name in _affiliated_organizations_by_name:
                # There's already one by this name
                _affiliated_organization = _affiliated_organizations_by_name[ao_name]
            else:
                # New affiliated org
                _affiliated_organization = Organization(
                    name=ao_name,
                    classification='company',
                    source_identified=True
                )

            # collect contact details
            affiliated_organization_contact_details = [
                {
                    "type": "address",
                    "note": "contact address",
                    "value": '; '.join([
                        p for p in [
                            ao['affiliated_organization_address'],
                            ao['affiliated_organization_city'],
                            ao['affiliated_organization_state'],
                            ao['affiliated_organization_zip'],
                            ao['affiliated_organization_country']]
                        if len(p) > 0]).strip(),
                },
            ]

            affiliated_organization_contact_ppb = {
                "type": "address",
                "note": "principal place of business",
                "value": '; '.join([
                    p for p in [
                        ao['affiliated_organization_ppb_city'],
                        ao['affiliated_organization_ppb_state'],
                        ao['affiliated_organization_ppb_country']]
                    if len(p) > 0]).strip(),
            }

            if affiliated_organization_contact_ppb["value"]:
                affiliated_organization_contact_details.append(
                    affiliated_organization_contact_ppb)

            # add contact details
            for cd in affiliated_organization_contact_details:
                _affiliated_organization.add_contact_detail(**cd)

            ao_extras["contact_details_structured"] = [
                {
                    "type": "address",
                    "note": "contact address",
                    "parts": [
                        {
                            "note": "address",
                            "value": ao['affiliated_organization_address'],
                        },
                        {
                            "note": "city",
                            "value": ao['affiliated_organization_city'],
                        },
                        {
                            "note": "state",
                            "value": ao['affiliated_organization_state'],
                        },
                        {
                            "note": "zip",
                            "value": ao['affiliated_organization_zip'],
                        },
                        {
                            "note": "country",
                            "value": ao['affiliated_organization_country'],
                        }
                    ],
                },
                {
                    "type": "address",
                    "note": "principal place of business",
                    "parts": [
                        {
                            "note": "city",
                            "value":
                                ao['affiliated_organization_ppb_city'],
                        },
                        {
                            "note": "state",
                            "value":
                                ao['affiliated_organization_ppb_state'],
                        },
                        {
                            "note": "country",
                            "value":
                                ao['affiliated_organization_ppb_country'],
                        }
                    ],
                },
            ],

            _affiliated_organization.extras = combine_dicts(
                _affiliated_organization.extras, ao_extras)

        for unique_affiliated_organization in _affiliated_organizations_by_name.values():
            _affiliated_organizations.append(unique_affiliated_organization)

        # # Events & Agendas
        # name
        if parsed_form['registration_type']['new_registrant']:
            registration_type = 'New Client, New Registrant'
        elif parsed_form['registration_type']['is_amendment']:
            registration_type = 'Amended Registration'
        else:
            registration_type = 'New Client for Existing Registrant'

        # Create registration event
        _event = Event(
            name="{rn} - {rt}, {cn}".format(rn=_registrant.name,
                                            rt=registration_type,
                                            cn=_client.name),
            timezone='America/New_York',
            location='United States',
            start_time=datetime.strptime(
                parsed_form['datetimes']['effective_date'],
                '%Y-%m-%d %H:%M:%S').replace(tzinfo=UTC),
            classification='registration'
        )

        # add participants
        _event.add_participant(type=_registrant._type,
                               id=_registrant._id,
                               name=_registrant.name,
                               note="registrant")

        if _registrant._type == 'person':
            _event.add_participant(type=_registrant._type,
                                   id=_registrant._id,
                                   name=_registrant.name,
                                   note="registrant")

        _event.add_participant(type=_client._type,
                               id=_client._id,
                               name=_client.name,
                               note="client")

        for l in _lobbyists:
            _event.add_participant(type=l._type,
                                   id=l._id,
                                   name=l.name,
                                   note='lobbyist')

        for fe in _foreign_entities:
            _event.add_participant(type=fe._type,
                                   id=fe._id,
                                   name=fe.name,
                                   note='foreign_entity')

        for ao in _affiliated_organizations:
            _event.add_participant(type=ao._type,
                                   id=ao._id,
                                   name=ao.name,
                                   note='affiliated_organization')

        # add agenda item
        _agenda = _event.add_agenda_item(
            description='issues lobbied on',
        )

        _agenda['notes'].append(
            parsed_form['lobbying_issues_detail']
        )

        for li in parsed_form['lobbying_issues']:
            if li['general_issue_area'] != '':
                _agenda.add_subject(li['general_issue_area'])

        _disclosure.add_disclosed_event(
            name=_event.name,
            type=_event._type,
            classification=_event.classification,
            id=_event._id
        )

        # add registrant to disclosure's _related and related_entities fields
        _disclosure.add_registrant(name=_registrant.name,
                                   type=_registrant._type,
                                   id=_registrant._id)

        _registrant.add_source(
            url=_source['url'],
            note='registrant'
        )
        yield _registrant

        if _registrant_self_employment is not None:
            _registrant_self_employment.add_source(
                url=_source['url'],
                note='registrant_self_employment'
            )

            yield _registrant_self_employment

        _client.add_source(
            url=_source['url'],
            note='client'
        )
        yield _client

        _main_contact.add_source(
            url=_source['url'],
            note='main_contact'
        )
        yield _main_contact

        for ao in _affiliated_organizations:
            ao.add_source(
                url=_source['url'],
                note='affiliated_organization'
            )
            yield ao
        for fe in _foreign_entities:
            fe.add_source(
                url=_source['url'],
                note='foreign_entity'
            )
            yield fe
        for l in _lobbyists:
            l.add_source(
                url=_source['url'],
                note='lobbyist'
            )
            yield l

        _event.add_source(**_source)
        yield _event
        _disclosure.add_source(**_source)
        yield _disclosure
예제 #26
0
    def scrape_chamber(self, chamber):
        """Yield a Person for each seated member listed on the chamber roster.

        Fetches the azleg.gov member roster for ``chamber`` and, for every
        row that is not marked vacant, the member's own page (used here only
        to locate a photo).

        Args:
            chamber: ``'lower'`` or ``'upper'``; any other value raises
                ``KeyError``.

        Yields:
            Person: carrying the capitol address and phone, party, link and
                source, plus an email and a membership when those were found.
        """
        chamber_code = {'lower': 'H', 'upper': 'S'}[chamber]
        url = 'http://www.azleg.gov/MemberRoster/?body=' + chamber_code

        # The page closes one comment with '--!>'; rewrite it to '-->' so the
        # parser is not thrown off.
        roster_doc = html.fromstring(self.get(url).text.replace('--!>', '-->'))

        # The first table row is skipped (presumably the header row).
        for row in roster_doc.xpath('//table//tr')[1:]:
            (name_cell, district_cell, party_cell,
             email_cell, room_cell, phone_cell) = row.xpath('td')

            # Vacant seats are flagged on the email cell; nothing to emit.
            if email_cell.attrib.get('class') == 'vacantmember':
                continue

            link = name_cell.xpath('string(a/@href)')
            if len(name_cell) == 1:
                position = ''
                name = name_cell.text_content().strip()
            else:
                position = name_cell.tail.strip()
                name = name_cell[0].text_content().strip()
            # Keep only what precedes a '--' separator in the name.
            name = name.partition('--')[0].strip()

            member_doc = html.fromstring(
                self.get(link).text.replace('--!>', '-->'))
            member_doc.make_links_absolute(link)

            photos = member_doc.xpath("//img[contains(@src, 'MemberPhoto')]")
            if len(photos) == 1:
                photo_url = photos[0].attrib['src']
            else:
                self.warning('no photo on ' + link)
                photo_url = ''

            district = district_cell.text_content().strip()
            party_text = party_cell.text_content().strip()
            email_text = email_cell.text_content().strip()

            # The cell holds only the mailbox name; anything that does not
            # carry the 'Email: ' prefix is treated as "no email".
            email = ''
            if email_text.startswith('Email: '):
                email = email_text.replace('Email: ', '').lower() + '@azleg.gov'

            party = self.get_party(party_text)
            room = room_cell.text_content().strip()
            building = ("House of Representatives\n" if chamber == 'lower'
                        else "Senate\n")
            address = (building + "1700 West Washington\n Room " + room
                       + "\nPhoenix, AZ 85007")

            phone = phone_cell.text_content().strip()
            digit_groups = re.findall(r'(\d+)', phone)
            if '602' not in digit_groups:
                # No '602' group among the digits: prepend the area code.
                phone = "602-" + phone

            leg = Person(primary_org=chamber, image=photo_url, name=name,
                         district=district, party=party)
            leg.add_contact_detail(type='address', value=address,
                                   note='Capitol Office')
            leg.add_contact_detail(type='voice', value=phone,
                                   note='Capitol Office')
            leg.add_party(party=party)
            leg.add_link(link)

            if email:
                leg.add_contact_detail(type='email', value=email)
            if position:
                leg.add_membership(name_or_org=party, role=position)

            leg.add_source(url)

            # Committee data is not collected here; the original note suggests
            # it should come from the committee scraper instead.
            yield leg
예제 #27
0
    def scrape(self):
        """Yield a ``Person`` for every legislator returned by the GraphQL API.

        Two organization-wide queries are defined up front.  The first one is
        run once ahead of time and every name it returns is removed from the
        ``rep_names`` list (defined outside this method); one extra
        name-filtered query is then queued for each name still left, since
        the organization-wide queries are capped at ``first: 100``.  All
        queued queries are finally executed and each result node becomes a
        ``Person``.

        Note that ``rep_names`` is pruned in place, so it stays shortened
        after this method has run.

        Yields:
            Person: one per ``node`` in each query response.
        """
        url = 'http://alpha.openstates.org/graphql'
        scrapers = [
            {
                'query':
                '{ people(memberOf:"ocd-organization/e91db6f8-2232-49cd-91af-fdb5adb4ac3b", first: 100) { edges { node { name party: currentMemberships(classification:"party") { organization { name }} links { url } sources { url } chamber: currentMemberships(classification:["upper", "lower"]) { post { label } organization { name classification parent { name }}}}}}}'
            },
            {
                'query':
                '{ people(memberOf:"ocd-organization/6a026144-758d-4d57-b856-9c60dce3c4b5", first: 100) { edges { node { name party: currentMemberships(classification:"party") { organization { name }} links { url } sources { url } chamber: currentMemberships(classification:["upper", "lower"]) { post { label } organization { name classification parent { name }}}}}}}'
            },
        ]

        # Run the first query once just to find out which expected names it
        # already covers.
        base = requests.get(url=url, json=scrapers[0])
        base = base.json()
        ppl = base['data']['people']['edges']
        for p in ppl:
            p = p['node']
            if p['name'] in rep_names:
                rep_names.remove(p['name'])

        # Get names unretrieved from primary House API Query
        print('REP NAMES: ', rep_names)
        # This name is never looked up individually.  It may already be gone
        # (returned by the query above, or removed by an earlier run), and an
        # unguarded list.remove() would then raise ValueError.
        if 'Gene Pelowski' in rep_names:
            rep_names.remove('Gene Pelowski')

        for missing_name in rep_names:
            query = '{ people(memberOf:"ocd-organization/e91db6f8-2232-49cd-91af-fdb5adb4ac3b", first: 100, name: "' + missing_name + '") { edges { node { name party: currentMemberships(classification:"party") { organization { name }} links { url } sources { url } chamber: currentMemberships(classification:["upper", "lower"]) { post { label } organization { name classification parent { name }}}}}}}'
            query = {'query': query}
            scrapers.append(query)

        for s in scrapers:
            base = requests.get(url=url, json=s)
            base = base.json()
            print(base)
            ppl = base['data']['people']['edges']
            for p in ppl:
                p = p['node']
                orgs = p['chamber']
                rep = Person(name=p['name'], role='State Representative')
                for o in orgs:
                    ppr(o)
                    name = o['organization']['name']
                    classification = o['organization']['classification']
                    if o['organization']['parent']:
                        pname = o['organization']['parent']['name']
                        if pname == 'Minnesota Legislature':
                            label = o['post']['label']
                            if 'House' in name:
                                role = 'State Representative'
                            elif 'Senate' in name:
                                role = 'State Senator'
                            else:
                                # Neither chamber matched.  Previously this
                                # fell through with ``role`` either unbound
                                # or left over from an earlier organization;
                                # skip rather than record a wrong term.
                                continue
                            rep.add_term(role,
                                         classification,
                                         district=label,
                                         org_name=name)
                            rep.add_source(p['sources'][0]['url'])

                        else:
                            rep.add_membership(name)
                            rep.add_source(p['sources'][0]['url'])
                yield rep
예제 #28
0
파일: people.py 프로젝트: rjm328/openstates
    def scrape(self):
        """Yield Connecticut legislators and their committees from the CGA CSV.

        Downloads ``LegislatorDatabase.csv``, checks its header row against
        ``HEADERS``, and builds one ``Person`` per data row with capitol
        contact details and, unless the home address is the Legislative
        Office Building, district contact details.  Each committee named in
        the ``committee member1`` column is created and yielded the first
        time it is seen; the legislator is then attached to it.

        Yields:
            Organization: each committee, on first encounter.
            Person: each legislator, after any new committees from its row.

        Raises:
            AssertionError: if the header row differs from ``HEADERS`` or a
                district is not purely numeric.
            KeyError: if a row's office code is neither ``'H'`` nor ``'S'``.
            ValueError: if the email column is non-empty but is neither an
                address containing ``@`` nor an http(s) URL.
        """
        leg_url = "ftp://ftp.cga.ct.gov/pub/data/LegislatorDatabase.csv"
        page = self.get(leg_url)

        # Committees created so far, keyed by name, so each is yielded once.
        committees = {}

        # Ensure that the spreadsheet's structure hasn't generally changed
        _row_headers = page.text.split('\r\n')[0].replace('"', '').split(',')
        assert _row_headers == HEADERS, "Spreadsheet structure may have changed"

        page = open_csv(page)
        for row in page:

            chamber = {'H': 'lower', 'S': 'upper'}[row['office code']]

            district = row['dist'].lstrip('0')
            assert district.isdigit(), "Invalid district found: {}".format(
                district)

            # Full name: first [middle initial] last [suffix].
            name = row['first name']
            mid = row['middle initial'].strip()
            if mid:
                name += " %s" % mid
            name += " %s" % row['last name']
            suffix = row['suffix'].strip()
            if suffix:
                name += " %s" % suffix

            party = row['party']
            if party == 'Democrat':
                party = 'Democratic'

            leg = Person(primary_org=chamber,
                         name=name,
                         district=district,
                         party=party)

            # Each backslash in the URL column becomes two forward slashes.
            legislator_url = row['URL'].replace('\\', '//').strip()
            if legislator_url != '':
                if not legislator_url.startswith('http'):
                    # Prefix the scheme.  This used to overwrite the value
                    # with the bare string 'http://', losing the actual link.
                    legislator_url = 'http://' + legislator_url
                leg.add_link(legislator_url)

            leg.add_party(party=party)

            office_address = "%s\nRoom %s\nHartford, CT 06106" % (
                row['capitol street address'], row['room number'])
            email = row['email'].strip()
            if "@" not in email:
                if not email:
                    email = None
                elif email.startswith(('http://', 'https://')):
                    # The column holds a URL rather than an address; it is
                    # not recorded as an email.
                    email = None
                else:
                    raise ValueError(
                        "Problematic email found: {}".format(email))
            leg.add_contact_detail(type='address',
                                   value=office_address,
                                   note='Capitol Office')
            leg.add_contact_detail(type='voice',
                                   value=row['capitol phone'],
                                   note='Capitol Office')
            if email:
                leg.add_contact_detail(type='email', value=email)

            home_address = "{}\n{}, {} {}".format(
                row['home street address'],
                row['home city'],
                row['home state'],
                row['home zip code'],
            )
            # A "home" address at the Legislative Office Building is not
            # recorded as a district office.
            if "Legislative Office Building" not in home_address:
                leg.add_contact_detail(type='address',
                                       value=home_address,
                                       note='District Office')
                if row['home phone'].strip():
                    leg.add_contact_detail(type='voice',
                                           value=row['home phone'],
                                           note='District Office')
            leg.add_source(leg_url)

            # Entries are ';'-separated, each optionally followed by a role
            # in parentheses, e.g. "Some Committee (Chair)".
            for comm_name in row['committee member1'].split(';'):
                if ' (' in comm_name:
                    # Split on the last ' (' only, so a committee name that
                    # itself contains a parenthesis no longer breaks the
                    # two-way unpacking with a ValueError.
                    comm_name, role = comm_name.rsplit(' (', 1)
                    role = role.strip(')').lower()
                else:
                    role = 'member'
                comm_name = comm_name.strip()
                if comm_name:
                    if comm_name in committees:
                        com = committees[comm_name]
                    else:
                        com = Organization(comm_name,
                                           classification='committee',
                                           chamber=chamber)
                        com.add_source(leg_url)
                        committees[comm_name] = com
                        yield com

                    leg.add_membership(name_or_org=com, role=role)

            yield leg
예제 #29
0
    def scrape_legislator_page(self, term, url):
        """Build and yield one Person from a legislator's detail page.

        Args:
            term: accepted but not used by this method.
            url: address of the legislator's page; it is fetched, and also
                recorded on the Person as both link and source.

        Yields:
            Person: with contact details and one 'committee member'
                membership per committee link found on the page.

        Raises:
            IndexError: if the page title or district link is missing.
            KeyError: if an inline field label, the chamber, or the party is
                not one of the values this method knows how to map.
        """
        doc = lxml.html.fromstring(self.get(url).text)
        doc.make_links_absolute(url)

        # Page title minus a leading "Representative " / "Senator " prefix.
        heading = doc.xpath("//h1[@id='page-title']/text()")[0]
        name = re.sub(r'^(Representative|Senator)\s', '', heading)

        district_text = doc.xpath("//a[contains(@href, 'district')]/text()")[0]
        district = district_text.replace("District", "").strip()

        committees = doc.xpath("//a[contains(@href, 'committees')]/text()")

        photo_sources = doc.xpath(
            "//div[@class='field-person-photo']/img/@src")
        photo = photo_sources[0] if photo_sources else None

        # Collapse runs of spaces/tabs in the address block, if there is one.
        address = None
        address_nodes = doc.xpath("//div[@class='adr']")
        if address_nodes:
            raw_address = address_nodes[0].text_content()
            address = re.sub("[ \t]+", " ", raw_address).strip()

        # On-page field label (lower-cased, trailing colon removed) -> key
        # under which its value is stored.
        label_to_key = {
            "email": "email",
            "home telephone": "home-telephone",
            "cellphone": "cellphone",
            "office telephone": "office-telephone",
            "political party": "party",
            "chamber": "chamber",
            "fax": "fax"
        }

        metainf = {}
        for field in doc.xpath("//div[contains(@class, 'field-label-inline')]"):
            # Each inline field is expected to have exactly two children.
            label_node, value_node = field.xpath("./*")
            label = label_node.text_content().strip().lower()
            if label.endswith(":"):
                label = label[:-1]
            value = value_node.text_content().strip()
            metainf[label_to_key[label]] = value

        chamber = {"Senate": "upper", "House": "lower"}[metainf['chamber']]
        party = {
            "Democrat": "Democratic",
            "Republican": "Republican",
        }[metainf['party']]

        person = Person(primary_org=chamber,
                        district=district,
                        name=name,
                        party=party,
                        image=photo)
        person.add_link(url)

        # Capitol-office details: added only when present and non-blank.
        capitol_fields = (('email', 'email'),
                          ('fax', 'fax'),
                          ('office-telephone', 'voice'))
        for meta_key, detail_type in capitol_fields:
            if meta_key in metainf and metainf[meta_key].strip():
                person.add_contact_detail(type=detail_type,
                                          value=metainf[meta_key],
                                          note="Capitol Office")

        if address:
            person.add_contact_detail(type='address',
                                      value=address,
                                      note="District Office")

        # Personal phone numbers are filed under the district office; unlike
        # the capitol fields, presence alone is enough for these.
        for meta_key in ('cellphone', 'home-telephone'):
            if meta_key in metainf:
                person.add_contact_detail(type='voice',
                                          value=metainf[meta_key],
                                          note="District Office")

        for committee_name in committees:
            person.add_membership(name_or_org=committee_name,
                                  role='committee member')

        person.add_source(url)
        yield person
예제 #30
0
    def scrape(self):
        '''
        Scrape board members, committees, and committee memberships.

        Scrape the web to create a dict with all active organizations, so
        that the correct URL for each organization's detail page can be
        looked up. Then build a Person for every office holder on the Board
        of Directors and an Organization for every committee body, adding
        committee memberships to the people as they are encountered.

        Yields each committee Organization as soon as its offices have been
        processed, then every Person collected during the scrape.
        '''
        web_scraper = LegistarPersonScraper(
            requests_per_minute=self.requests_per_minute)
        web_scraper.MEMBERLIST = 'https://metro.legistar.com/People.aspx'

        # Map each stripped 'Department Name' label to the full
        # 'Department Name' entry it came from.
        web_info = {}
        for _, organizations in web_scraper.councilMembers():
            for organization, _, _ in organizations:
                organization_info = organization['Department Name']
                organization_name = organization_info['label'].strip()
                web_info[organization_name] = organization_info

        body_types = self.body_types()

        # Exactly one body must match: the single-target unpacking raises
        # ValueError if zero or several bodies carry this name.
        board_of_directors, = [
            body for body in self.bodies()
            if body['BodyName'] == 'Board of Directors - Regular Board Meeting'
        ]
        board_of_directors["BodyName"] = "Board of Directors"

        # Group the board's office records by the office holder's full name.
        terms = collections.defaultdict(list)
        for office in self.body_offices(board_of_directors):
            terms[office['OfficeRecordFullName']].append(office)

        members = {}
        for member, offices in terms.items():
            p = Person(member)

            for term in offices:
                role = term['OfficeRecordTitle']

                # Any title other than plain (non-)voting membership is
                # recorded as its own appointed term.
                if role not in {'Board Member', 'non-voting member'}:
                    p.add_term(
                        role,
                        'legislature',
                        start_date=self.toDate(term['OfficeRecordStartDate']),
                        end_date=self.toDate(term['OfficeRecordEndDate']),
                        appointment=True)

                # Everyone except the CEO also gets a board membership term,
                # with the post looked up by the member's full name.
                if role != 'Chief Executive Officer':
                    if role == 'non-voting member':
                        member_type = 'Nonvoting Board Member'
                        post = NONVOTING_POSTS.get(member)
                    else:
                        member_type = 'Board Member'
                        post = VOTING_POSTS.get(member)

                    start_date = self.toDate(term['OfficeRecordStartDate'])
                    end_date = self.toDate(term['OfficeRecordEndDate'])
                    board_membership = p.add_term(member_type,
                                                  'legislature',
                                                  district=post,
                                                  start_date=start_date,
                                                  end_date=end_date)

                    acting_member_end_date = ACTING_MEMBERS_WITH_END_DATE.get(
                        p.name)

                    # Flag the term as acting when the configured acting end
                    # date falls on or before this term's end date.
                    if acting_member_end_date and acting_member_end_date <= end_date:
                        board_membership.extras = {'acting': 'true'}

            # Each term contains first and last names. This should be the same
            # across all of a person's terms, so go ahead and grab them from the
            # last term in the array.
            p.family_name = term['OfficeRecordLastName']
            p.given_name = term['OfficeRecordFirstName']

            # Defensively assert that the given and family names match the
            # expected value.
            if member == 'Hilda L. Solis':
                # Given/family name does not contain middle initial.
                assert p.given_name == 'Hilda' and p.family_name == 'Solis'
            else:
                assert member == ' '.join([p.given_name, p.family_name])

            source_urls = self.person_sources_from_office(term)
            person_api_url, person_web_url = source_urls

            p.add_source(person_api_url, note='api')
            p.add_source(person_web_url, note='web')

            members[member] = p

        for body in self.bodies():
            if body['BodyTypeId'] in (
                    body_types['Committee'],
                    body_types['Independent Taxpayer Oversight Committee']):
                organization_name = body['BodyName'].strip()
                o = Organization(organization_name,
                                 classification='committee',
                                 parent_id={'name': 'Board of Directors'})

                # Fall back to the generic departments listing when the web
                # scrape produced no URL for this body. The fallback is
                # already an absolute URL, so it must not be prefixed with
                # self.WEB_URL.
                organization_info = web_info.get(organization_name, {})
                organization_url = organization_info.get(
                    'url', 'https://metro.legistar.com/Departments.aspx')

                o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body),
                             note='api')
                o.add_source(organization_url, note='web')

                for office in self.body_offices(body):
                    role = office['OfficeRecordTitle']

                    # Titles outside BOARD_OFFICE_ROLES collapse to a generic
                    # (non-)voting member role.
                    if role not in BOARD_OFFICE_ROLES:
                        if role == 'non-voting member':
                            role = 'Nonvoting Member'
                        else:
                            role = 'Member'

                    person = office['OfficeRecordFullName']

                    if person in members:
                        p = members[person]
                    else:
                        # Committee member who holds no board office: create
                        # the Person here so it is yielded with the others.
                        p = Person(person)

                        source_urls = self.person_sources_from_office(office)
                        person_api_url, person_web_url = source_urls
                        p.add_source(person_api_url, note='api')
                        p.add_source(person_web_url, note='web')

                        members[person] = p

                    start_date = self.toDate(office['OfficeRecordStartDate'])
                    end_date = self.toDate(office['OfficeRecordEndDate'])
                    membership = p.add_membership(organization_name,
                                                  role=role,
                                                  start_date=start_date,
                                                  end_date=end_date)

                    acting_member_end_date = ACTING_MEMBERS_WITH_END_DATE.get(
                        p.name)
                    if acting_member_end_date and acting_member_end_date <= end_date:
                        membership.extras = {'acting': 'true'}

                yield o

        for p in members.values():
            yield p
예제 #31
0
    def scrape_chamber(self, chamber):
        """Yield the people and committees listed for one chamber.

        Fetches the scstatehouse.gov member roster for ``chamber``
        ("lower" selects the ``chamber=H`` roster, anything else the
        ``chamber=S`` one), visits every member page, and builds a Person
        with party, district, photo, contact details and committee
        memberships. Members whose page contains "Resigned effective" are
        skipped. Each committee Organization is yielded once, the first
        time it is seen; each Person is yielded after its memberships have
        been added.
        """
        # Abbreviated committee roles mapped to their spelled-out form. An
        # abbreviation missing from this table raises KeyError.
        role_names = {
            "Treas.": "treasurer",
            "Secy.": "secretary",
            "Secy./Treas.": "secretary/treasurer",
            "V.C.": "vice-chair",
            "1st V.C.": "first vice-chair",
            "Co 1st V.C.": "co-first vice-chair",
            "2nd V.C.": "second vice-chair",
            "3rd V.C.": "third vice-chair",
            "Ex.Officio Member": "ex-officio member",
            "Chairman": "chairman",
        }

        # (container XPath, contact note, warning template) per office block.
        office_blocks = (
            (
                '//div[@style="float: left; width: 225px;'
                ' margin: 10px 5px 0 20px; padding: 0;"]',
                "Capitol Office",
                "no capitol address for {0}",
            ),
            (
                '//div[@style="float: left;'
                ' width: 225px; margin: 10px 0 0 20px;"]',
                "District Office",
                "no district address for {0}",
            ),
        )
        # Both office blocks use the same paragraph styles inside their div.
        address_xpath = ('p[@style="font-size: 13px;'
                         ' margin: 0 0 10px 0; padding: 0;"]')
        phone_xpath = ('p[@style="font-size: 13px;'
                       ' margin: 0 0 0 0; padding: 0;"]/text()')

        url = ("http://www.scstatehouse.gov/member.php?chamber=H"
               if chamber == "lower"
               else "http://www.scstatehouse.gov/member.php?chamber=S")

        committees_by_name = {}

        roster = lxml.html.fromstring(self.get(url).text)
        roster.make_links_absolute(url)

        for anchor in roster.xpath('//a[@class="membername"]'):
            full_name = anchor.text
            leg_url = anchor.get("href")

            # Drop a leading honorific from the displayed name.
            for title in ("Senator", "Representative"):
                if full_name.startswith(title):
                    full_name = full_name.replace(title + " ", "")

            leg_html = self.get(leg_url).text
            leg_doc = lxml.html.fromstring(leg_html)
            leg_doc.make_links_absolute(leg_url)

            if "Resigned effective" in leg_html:
                self.info("Resigned")
                continue

            # Exactly three text nodes are expected here; the third is unused.
            header_lines = leg_doc.xpath(
                '//p[@style="font-size: 17px;'
                ' margin: 0 0 0 0; padding: 0;"]/text()')
            party, district_line, _ = header_lines

            for keyword, label in (("Republican", "Republican"),
                                   ("Democrat", "Democratic")):
                if keyword in party:
                    party = label
                    break

            # District # - County - Map
            district = district_line.split()[1]

            photos = leg_doc.xpath('//img[contains(@src,"/members/")]/@src')
            if photos:
                photo_url = photos[0]
            else:
                self.warning("No Photo URL for {}".format(full_name))
                photo_url = ""

            person = Person(
                name=full_name,
                district=district,
                party=party,
                primary_org=chamber,
                image=photo_url,
            )

            # Capitol office first, then district office. A missing div or
            # paragraph surfaces as IndexError and is only warned about.
            for div_xpath, note, missing_msg in office_blocks:
                try:
                    office_div = leg_doc.xpath(div_xpath)[0]
                    address = office_div.xpath(
                        address_xpath)[0].text_content()
                    phone = office_div.xpath(phone_xpath)[0].strip()

                    if address:
                        person.add_contact_detail(type="address",
                                                  value=address,
                                                  note=note)
                    if phone:
                        person.add_contact_detail(type="voice",
                                                  value=phone,
                                                  note=note)
                except IndexError:
                    self.warning(missing_msg.format(full_name))

            person.add_link(leg_url)
            person.add_source(url)
            person.add_source(leg_url)

            # committees (skip first link)
            committee_links = leg_doc.xpath(
                '//a[contains(@href, "committee.php")]')
            for link in committee_links[1:]:
                if link.text.endswith(", "):
                    committee, abbreviation = link.text_content().rsplit(
                        ", ", 1)
                    role = role_names[abbreviation]
                else:
                    committee, role = link.text, "member"

                # Reuse the Organization when this committee was seen before;
                # otherwise create it and yield it exactly once.
                if committee in committees_by_name:
                    org = committees_by_name[committee]
                else:
                    org = Organization(name=committee,
                                       classification="committee",
                                       chamber=chamber)
                    org.add_source(url)
                    committees_by_name[committee] = org
                    yield org

                person.add_membership(org, role=role)

            yield person
예제 #32
0
    def scrape_chamber(self, chamber):
        """Yield a Person for each non-vacant row of the azleg.gov roster.

        ``chamber`` must be "lower" or "upper" (any other value raises
        KeyError). For every roster row the member's own page is fetched to
        find a photo, and a Person is built with a capitol address, phone
        number, party, link and, when available, an email address and a
        membership carrying the member's position.
        """
        body_code = {"lower": "H", "upper": "S"}[chamber]
        url = "http://www.azleg.gov/MemberRoster/?body=" + body_code

        # there is a bad comment closing tag on this page
        roster_html = self.get(url).text.replace("--!>", "-->")
        root = html.fromstring(roster_html)

        # The first table row is skipped (presumably a header row).
        for row in root.xpath("//table//tr")[1:]:
            # Each row must contain exactly six cells.
            (name_td, district_td, party_td,
             email_td, room_td, phone_td) = row.xpath("td")

            if email_td.attrib.get("class") == "vacantmember":
                continue  # Skip any vacant members.

            link = name_td.xpath("string(a/@href)")

            position = ""
            if len(name_td) == 1:
                name = name_td.text_content().strip()
            else:
                position = name_td.tail.strip()
                name = name_td[0].text_content().strip()
            if "--" in name:
                name = name.split("--")[0].strip()

            # The member page gets the same comment-terminator repair.
            member_html = self.get(link).text.replace("--!>", "-->")
            member_root = html.fromstring(member_html)
            member_root.make_links_absolute(link)

            photos = member_root.xpath("//img[contains(@src, 'MemberPhoto')]")
            if len(photos) == 1:
                photo_url = photos[0].attrib["src"]
            else:
                self.warning("no photo on " + link)
                photo_url = ""

            district = district_td.text_content().strip()
            party_text = party_td.text_content().strip()
            email_text = email_td.text_content().strip()

            # The cell holds only the mailbox name; append the domain.
            email = ""
            if email_text.startswith("Email: "):
                email = (email_text.replace("Email: ", "").lower()
                         + "@azleg.gov")

            party = self.get_party(party_text)
            room = room_td.text_content().strip()

            chamber_line = ("House of Representatives\n"
                            if chamber == "lower" else "Senate\n")
            address = (chamber_line + "1700 West Washington\n Room " + room +
                       "\nPhoenix, AZ 85007")

            # Prefix the 602 area code unless it already appears as one of
            # the digit groups.
            phone = phone_td.text_content().strip()
            if "602" not in re.findall(r"(\d+)", phone):
                phone = "602-" + phone

            member = Person(
                name=name,
                district=district,
                party=party,
                primary_org=chamber,
                image=photo_url,
            )
            member.add_contact_detail(type="address",
                                      value=address,
                                      note="Capitol Office")
            member.add_contact_detail(type="voice",
                                      value=phone,
                                      note="Capitol Office")
            member.add_party(party=party)
            member.add_link(link)

            if email:
                member.add_contact_detail(type="email", value=email)
            if position:
                member.add_membership(name_or_org=party, role=position)

            member.add_source(url)

            # Member-page details are not scraped here; they can probably
            # come from the committee scraper instead.
            yield member
예제 #33
0
    def scrape_chamber(self, chamber):
        '''
        Yield the people and committees listed for one chamber.

        Fetches the scstatehouse.gov member roster for ``chamber``
        ('lower' selects the ``chamber=H`` roster, anything else the
        ``chamber=S`` one), visits every member page, and builds a Person
        with party, district, photo, contact details and committee
        memberships. Members whose page contains 'Resigned effective' are
        skipped. A member page without a photo produces a warning and an
        empty image URL instead of aborting the scrape.

        Each committee Organization is yielded once, the first time it is
        seen; each Person is yielded after its memberships have been added.
        '''
        # Abbreviated committee roles mapped to their spelled-out form. Built
        # once rather than per committee link; an abbreviation missing from
        # this table raises KeyError.
        known_roles = {'Treas.': 'treasurer',
                       'Secy.': 'secretary',
                       'Secy./Treas.': 'secretary/treasurer',
                       'V.C.': 'vice-chair',
                       '1st V.C.': 'first vice-chair',
                       'Co 1st V.C.': 'co-first vice-chair',
                       '2nd V.C.': 'second vice-chair',
                       '3rd V.C.': 'third vice-chair',
                       'Ex.Officio Member': 'ex-officio member',
                       'Chairman': 'chairman'}

        if chamber == 'lower':
            url = 'http://www.scstatehouse.gov/member.php?chamber=H'
        else:
            url = 'http://www.scstatehouse.gov/member.php?chamber=S'

        seen_committees = {}

        data = self.get(url).text
        doc = lxml.html.fromstring(data)
        doc.make_links_absolute(url)

        for a in doc.xpath('//a[contains(@href, "code=")]'):
            full_name = a.text
            leg_url = a.get('href')

            leg_html = self.get(leg_url).text
            leg_doc = lxml.html.fromstring(leg_html)
            leg_doc.make_links_absolute(leg_url)

            if 'Resigned effective' in leg_html:
                self.info('Resigned')
                continue

            # Exactly three text nodes are expected here; the third is unused.
            party, district, _ = leg_doc.xpath('//p[@style="font-size: 17px;'
                                               ' margin: 0 0 0 0; padding: 0;"]/text()')

            if 'Republican' in party:
                party = 'Republican'
            elif 'Democrat' in party:
                party = 'Democratic'

            # District # - County - Map
            district = district.split()[1]

            # Not every member page has a photo; fall back to an empty image
            # URL rather than letting IndexError abort the whole scrape.
            photos = leg_doc.xpath('//img[contains(@src,"/members/")]/@src')
            if photos:
                photo_url = photos[0]
            else:
                self.warning('No Photo URL for {}'.format(full_name))
                photo_url = ''

            person = Person(name=full_name, district=district,
                            party=party, primary_org=chamber,
                            image=photo_url)

            # office address / phone
            # A missing div or paragraph surfaces as IndexError and is only
            # warned about.
            try:
                addr_div = leg_doc.xpath('//div[@style="float: left; width: 225px;'
                                         ' margin: 10px 5px 0 20px; padding: 0;"]')[0]
                capitol_address = addr_div.xpath('p[@style="font-size: 13px;'
                                                 ' margin: 0 0 10px 0; padding: 0;"]'
                                                 )[0].text_content()

                phone = addr_div.xpath('p[@style="font-size: 13px;'
                                       ' margin: 0 0 0 0; padding: 0;"]/text()')[0]
                capitol_phone = phone.strip()

                if capitol_address:
                    person.add_contact_detail(type='address', value=capitol_address,
                                              note='Capitol Office')

                if capitol_phone:
                    person.add_contact_detail(type='voice', value=capitol_phone,
                                              note='Capitol Office')
            except IndexError:
                self.warning('no capitol address for {0}'.format(full_name))

            # home address / phone
            try:
                addr_div = leg_doc.xpath('//div[@style="float: left;'
                                         ' width: 225px; margin: 10px 0 0 20px;"]')[0]
                addr = addr_div.xpath('p[@style="font-size: 13px;'
                                      ' margin: 0 0 10px 0; padding: 0;"]')[0].text_content()

                phone = addr_div.xpath('p[@style="font-size: 13px;'
                                       ' margin: 0 0 0 0; padding: 0;"]/text()')[0]
                phone = phone.strip()
                if addr:
                    person.add_contact_detail(type='address', value=addr,
                                              note='District Office')

                if phone:
                    person.add_contact_detail(type='voice', value=phone,
                                              note='District Office')
            except IndexError:
                self.warning('no district address for {0}'.format(full_name))

            person.add_link(leg_url)
            person.add_source(url)
            person.add_source(leg_url)

            # committees (skip first link)
            for com in leg_doc.xpath('//a[contains(@href, "committee.php")]')[1:]:
                if com.text.endswith(', '):
                    # The link's own text ends with ', ' and the role
                    # abbreviation follows in the remaining text content.
                    committee, role = com.text_content().rsplit(', ', 1)
                    role = known_roles[role]
                else:
                    committee = com.text
                    role = 'member'

                # only yield each committee once
                if committee not in seen_committees:
                    com = Organization(name=committee, classification='committee',
                                       chamber=chamber)
                    com.add_source(url)
                    seen_committees[committee] = com
                    yield com
                else:
                    com = seen_committees[committee]

                person.add_membership(com, role=role)

            yield person