Пример #1
0
def refresh_people(jurisdiction):
    """Fetch the first People.aspx page for *jurisdiction* and save it.

    The page comes from ``LegistarPersonScraper.pages`` and is handed to
    ``save_page`` together with the jurisdiction and the file name
    ``people.html``.
    """
    member_list_url = 'https://{}.legistar.com/People.aspx'.format(jurisdiction)
    scraper = LegistarPersonScraper()

    # Only the first page produced by the scraper is kept.
    first_page = next(scraper.pages(member_list_url))
    save_page(first_page, jurisdiction, 'people.html')
def refresh_people(jurisdiction):
    """Save the first page of a jurisdiction's Legistar people listing.

    Builds the ``People.aspx`` URL from *jurisdiction*, takes the first item
    yielded by ``LegistarPersonScraper.pages`` for it, and passes that page
    to ``save_page`` as ``people.html``.
    """
    people_scraper = LegistarPersonScraper()
    listing_url = 'https://{}.legistar.com/People.aspx'.format(jurisdiction)
    page_iter = people_scraper.pages(listing_url)
    save_page(next(page_iter), jurisdiction, 'people.html')
def test_parse_people(project_directory, mocker, jurisdiction):
    """Smoke-test ``councilMembers`` against the saved people.html fixture.

    The scraper's ``pages`` method is patched to return the parsed fixture,
    then the first result of ``councilMembers(follow_links=False)`` is
    printed. No assertion is made on the result itself.
    """
    fixture_path = os.path.join(
        project_directory, 'tests', 'fixtures', jurisdiction, 'people.html')

    scraper = LegistarPersonScraper()
    scraper.BASE_URL = '{}.legistar.com'.format(jurisdiction)

    with open(fixture_path, 'r') as fixture:
        markup = fixture.read()
        page = lxml.html.fromstring(markup)
        # Serve the fixture instead of fetching the live site.
        mocker.patch.object(scraper, 'pages', return_value=page)
        member_iter = scraper.councilMembers(follow_links=False)
        result = next(member_iter)
        print(result)
def test_parse_people(project_directory, mocker, jurisdiction):
    """Parse the people.html fixture for *jurisdiction* and print one member.

    ``scraper.pages`` is replaced with a mock returning the fixture parsed by
    ``lxml.html.fromstring``; the first item produced by
    ``councilMembers(follow_links=False)`` is then printed.
    """
    path_parts = (project_directory, 'tests', 'fixtures', jurisdiction,
                  'people.html')
    people_fixture = os.path.join(*path_parts)

    scraper = LegistarPersonScraper()
    scraper.BASE_URL = '{}.legistar.com'.format(jurisdiction)

    with open(people_fixture, 'r') as handle:
        parsed_page = lxml.html.fromstring(handle.read())
        # Stub out page retrieval so the scraper reads the fixture.
        mocker.patch.object(scraper, 'pages', return_value=parsed_page)
        first_member = next(scraper.councilMembers(follow_links=False))
        print(first_member)
Пример #5
0
    def scrape(self):
        '''
        Yield committee Organizations, then every Person encountered.

        People come from the offices of the 'Board of Directors - Regular
        Board Meeting' body (renamed to 'Board of Directors') and from the
        offices of each committee body. Committee web URLs are looked up
        from the Legistar People.aspx page when available.
        '''
        # Scrape the web to create a dict with all active organizations.
        # Then, we can access the correct URL for the organization detail page.
        web_scraper = LegistarPersonScraper(
            requests_per_minute=self.requests_per_minute)
        web_scraper.MEMBERLIST = 'https://metro.legistar.com/People.aspx'
        web_info = {}

        for _, organizations in web_scraper.councilMembers():
            for organization, _, _ in organizations:
                organization_info = organization['Department Name']
                organization_name = organization_info['label'].strip()

                web_info[organization_name] = organization_info

        body_types = self.body_types()

        # Exactly one body must match; the unpacking raises otherwise.
        board_of_directors, = [
            body for body in self.bodies()
            if body['BodyName'] == 'Board of Directors - Regular Board Meeting'
        ]
        board_of_directors["BodyName"] = "Board of Directors"

        # Group the board's office records by the member's full name.
        terms = collections.defaultdict(list)
        for office in self.body_offices(board_of_directors):
            terms[office['OfficeRecordFullName']].append(office)

        members = {}
        for member, offices in terms.items():
            p = Person(member)

            for term in offices:
                role = term['OfficeRecordTitle']

                # Any title other than plain membership is recorded as its
                # own appointed term.
                if role not in {'Board Member', 'non-voting member'}:
                    p.add_term(
                        role,
                        'legislature',
                        start_date=self.toDate(term['OfficeRecordStartDate']),
                        end_date=self.toDate(term['OfficeRecordEndDate']),
                        appointment=True)

                # Everyone except the CEO also gets a board membership term.
                if role != 'Chief Executive Officer':
                    if role == 'non-voting member':
                        member_type = 'Nonvoting Board Member'
                        post = NONVOTING_POSTS.get(member)
                    else:
                        member_type = 'Board Member'
                        post = VOTING_POSTS.get(member)

                    start_date = self.toDate(term['OfficeRecordStartDate'])
                    end_date = self.toDate(term['OfficeRecordEndDate'])
                    board_membership = p.add_term(member_type,
                                                  'legislature',
                                                  district=post,
                                                  start_date=start_date,
                                                  end_date=end_date)

                    acting_member_end_date = ACTING_MEMBERS_WITH_END_DATE.get(
                        p.name)

                    if acting_member_end_date and acting_member_end_date <= end_date:
                        board_membership.extras = {'acting': 'true'}

            # Each term contains first and last names. This should be the same
            # across all of a person's terms, so go ahead and grab them from the
            # last term in the array.
            p.family_name = term['OfficeRecordLastName']
            p.given_name = term['OfficeRecordFirstName']

            # Defensively assert that the given and family names match the
            # expected value.
            if member == 'Hilda L. Solis':
                # Given/family name does not contain middle initial.
                assert p.given_name == 'Hilda' and p.family_name == 'Solis'
            else:
                assert member == ' '.join([p.given_name, p.family_name])

            source_urls = self.person_sources_from_office(term)
            person_api_url, person_web_url = source_urls

            p.add_source(person_api_url, note='api')
            p.add_source(person_web_url, note='web')

            members[member] = p

        for body in self.bodies():
            if body['BodyTypeId'] in (
                    body_types['Committee'],
                    body_types['Independent Taxpayer Oversight Committee']):
                organization_name = body['BodyName'].strip()
                o = Organization(organization_name,
                                 classification='committee',
                                 parent_id={'name': 'Board of Directors'})

                organization_info = web_info.get(organization_name, {})
                # The fallback is already an absolute URL, so it must not be
                # prefixed with self.WEB_URL.
                organization_url = organization_info.get(
                    'url', 'https://metro.legistar.com/Departments.aspx')

                o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body),
                             note='api')
                o.add_source(organization_url, note='web')

                for office in self.body_offices(body):
                    role = office['OfficeRecordTitle']

                    # Titles outside BOARD_OFFICE_ROLES collapse to a generic
                    # voting / non-voting committee role.
                    if role not in BOARD_OFFICE_ROLES:
                        if role == 'non-voting member':
                            role = 'Nonvoting Member'
                        else:
                            role = 'Member'

                    person = office['OfficeRecordFullName']

                    if person in members:
                        p = members[person]
                    else:
                        # Committee member who holds no board office.
                        p = Person(person)

                        source_urls = self.person_sources_from_office(office)
                        person_api_url, person_web_url = source_urls
                        p.add_source(person_api_url, note='api')
                        p.add_source(person_web_url, note='web')

                        members[person] = p

                    start_date = self.toDate(office['OfficeRecordStartDate'])
                    end_date = self.toDate(office['OfficeRecordEndDate'])
                    membership = p.add_membership(organization_name,
                                                  role=role,
                                                  start_date=start_date,
                                                  end_date=end_date)

                    acting_member_end_date = ACTING_MEMBERS_WITH_END_DATE.get(
                        p.name)
                    if acting_member_end_date and acting_member_end_date <= end_date:
                        membership.extras = {'acting': 'true'}

                yield o

        # People are yielded last so they carry all committee memberships.
        for p in members.values():
            yield p
Пример #6
0
    def scrape(self):
        """Build a Person for every member of Milwaukee's COMMON COUNCIL.

        Office records come from ``self.body_offices`` for the body named
        'COMMON COUNCIL'; contact data comes from the Legistar web page
        scraped with ``LegistarPersonScraper``. Every name in the office
        records must also appear on the web page, otherwise the lookup in
        ``web_info`` raises KeyError.

        NOTE(review): in the code visible here ``members`` is never filled
        and nothing is yielded or returned -- presumably the remainder of the
        method was cut off; confirm against the full source.
        """
        body_types = self.body_types()

        # Exactly one body must match; the unpacking raises otherwise.
        city_council, = [
            body for body in self.bodies()
            if body['BodyName'] == 'COMMON COUNCIL'
        ]

        # Group office records by the member's (stripped) full name.
        terms = collections.defaultdict(list)
        for office in self.body_offices(city_council):
            terms[office['OfficeRecordFullName'].strip()].append(office)

        web_scraper = LegistarPersonScraper(
            requests_per_minute=self.requests_per_minute)
        web_scraper.MEMBERLIST = 'https://milwaukee.legistar.com/DepartmentDetail.aspx?ID=1998&GUID=74273156-5389-46F3-9D09-3D850BDE32A1'

        if self.cache_storage:
            web_scraper.cache_storage = self.cache_storage

        if self.requests_per_minute == 0:
            web_scraper.cache_write_only = False

        web_info = {}
        for member, _ in web_scraper.councilMembers(
            {'ctl00$ContentPlaceHolder$lstName': 'COMMON COUNCIL'}):
            web_info[member['Person Name']['label']] = member

        # Maps the web-page column name to (contact type, note).
        contact_types = {
            "City Hall Address": ("address", "City Hall Address"),
            "City Hall Phone": ("voice", "City Hall Phone"),
            "District Office Phone": ("voice", "District Office Phone"),
            "District Office Address":
            ("address", "District Office Address"),
            "Fax": ("fax", "Fax")
        }

        members = {}
        for member, offices in terms.items():
            web = web_info[member]
            p = Person(member)

            # The district number is taken from the member's website URL
            # (the word characters following "/district"). It does not vary
            # by term, so it is computed once per member.
            district = re.search(r'(?<=(/{1}district))[\w]+',
                                 web['Website']['url']).group(0)
            district_label = "District {}".format(int(district))

            for term in offices:
                p.add_term('Alderman',
                           'legislature',
                           district=district_label,
                           start_date=self.toDate(
                               term['OfficeRecordStartDate']),
                           end_date=self.toDate(term['OfficeRecordEndDate']))
            if web.get('Photo'):
                p.image = web['Photo']

            for contact_type, (type_, _note) in contact_types.items():
                if web[contact_type] and web[contact_type] != 'N/A':
                    p.add_contact_detail(type=type_,
                                         value=web[contact_type],
                                         note=_note)

            if web['E-mail'] and web['E-mail'][
                    'label'] and web['E-mail']['label'] != 'N/A':
                p.add_contact_detail(type='email',
                                     value=web['E-mail']['label'],
                                     note='E-mail')

            if web["Website"]:
                p.add_link(web["Website"]['url'])

            # Sources are derived from the last office record of the member.
            source_urls = self.person_sources_from_office(term)
            person_api_url, person_web_url = source_urls
            p.add_source(person_api_url, note='api')
            p.add_source(person_web_url, note='web')
Пример #7
0
    def scrape(self):
        """Yield Chicago committee Organizations, then every Person seen.

        Aldermen are built from the 'City Council' office records combined
        with the Legistar web listing. Bodies of type 'Committee' are yielded
        with their memberships attached to people, followed by bodies of type
        'Joint Committee' (no memberships), and finally all people.
        """
        body_types = self.body_types()

        # Exactly one body must match; the unpacking raises otherwise.
        council_matches = [
            candidate for candidate in self.bodies()
            if candidate['BodyName'] == 'City Council'
        ]
        city_council, = council_matches

        # Group non-vacant office records by stripped full name.
        terms = collections.defaultdict(list)
        for record in self.body_offices(city_council):
            full_name = record['OfficeRecordFullName']
            if 'VACAN' in full_name:
                continue
            terms[full_name.strip()].append(record)

        web_scraper = LegistarPersonScraper(None, None)
        web_scraper.MEMBERLIST = 'https://chicago.legistar.com/DepartmentDetail.aspx?ID=12357&GUID=4B24D5A9-FED0-4015-9154-6BFFFB2A8CB4&R=8bcbe788-98cd-4040-9086-b34fa8e49881'
        web_scraper.ALL_MEMBERS = '3:3'

        search = {'ctl00$ContentPlaceHolder$lstName': 'City Council'}
        web_info = {
            listed['Person Name']['label']: listed
            for listed, _ in web_scraper.councilMembers(search)
        }

        # These two names get placeholder records holding only a ward
        # number; every other field reads as None.
        for name, ward in (('Balcer, James', 11), ('Fioretti, Bob', 2)):
            placeholder = collections.defaultdict(lambda: None)
            placeholder['Ward/Office'] = ward
            web_info[name] = placeholder

        # (web column / note, contact type) in the order they are added.
        contact_fields = (
            ("City Hall Address", "address"),
            ("City Hall Phone", "voice"),
            ("Ward Office Phone", "voice"),
            ("Ward Office Address", "address"),
            ("Fax", "fax"),
        )

        members = {}
        for member, offices in terms.items():
            web = web_info[member]
            alderman = Person(member)

            for term in offices:
                _title = term['OfficeRecordTitle']
                term_start = self.toDate(term['OfficeRecordStartDate'])
                term_end = self.toDate(term['OfficeRecordEndDate'])
                alderman.add_term('Alderman',
                                  'legislature',
                                  district="Ward {}".format(
                                      int(web['Ward/Office'])),
                                  start_date=term_start,
                                  end_date=term_end)

            if web.get('Photo'):
                alderman.image = web['Photo']

            for field, kind in contact_fields:
                value = web[field]
                if not value or value == 'N/A':
                    continue
                alderman.add_contact_detail(type=kind, value=value, note=field)

            email = web["E-mail"]
            if email and email["label"] and email["label"] != 'N/A':
                alderman.add_contact_detail(type="email",
                                            value=email['label'],
                                            note='E-mail')

            website = web['Website']
            if website:
                alderman.add_link(website['url'])

            # Sources are derived from the member's last office record.
            api_url, web_url = self.person_sources_from_office(term)
            alderman.add_source(api_url, note='api')
            alderman.add_source(web_url, note='web')

            members[member] = alderman

        for body in self.bodies():
            if body['BodyTypeId'] != body_types['Committee']:
                continue

            committee = Organization(body['BodyName'],
                                     classification='committee',
                                     parent_id={'name': 'Chicago City Council'})

            api_source = self.BASE_URL + '/bodies/{BodyId}'.format(**body)
            web_source = (
                self.WEB_URL +
                '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'.format(
                    **body))
            committee.add_source(api_source, note='api')
            committee.add_source(web_source, note='web')

            for office in self.body_offices(body):
                # messed up record for joanna thompson
                if office['OfficeRecordId'] == 1055:
                    continue

                role = office['OfficeRecordTitle']
                if role not in ("Vice Chair", "Chairman"):
                    role = 'Member'

                person = office['OfficeRecordFullName'].strip()
                holder = members.get(person)
                if holder is None:
                    # Committee member without a council office record.
                    holder = Person(person)
                    api_url, web_url = self.person_sources_from_office(office)
                    holder.add_source(api_url, note='api')
                    holder.add_source(web_url, note='web')
                    members[person] = holder

                office_start = self.toDate(office['OfficeRecordStartDate'])
                office_end = self.toDate(office['OfficeRecordEndDate'])
                holder.add_membership(body['BodyName'],
                                      role=role,
                                      start_date=office_start,
                                      end_date=office_end)

            yield committee

        for body in self.bodies():
            if body['BodyTypeId'] != body_types['Joint Committee']:
                continue

            joint = Organization(body['BodyName'],
                                 classification='committee',
                                 parent_id={'name': 'Chicago City Council'})

            api_source = self.BASE_URL + '/bodies/{BodyId}'.format(**body)
            web_source = (
                self.WEB_URL +
                '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'.format(
                    **body))
            joint.add_source(api_source, note='api')
            joint.add_source(web_source, note='web')

            yield joint

        # People go last so they carry all committee memberships.
        for person_obj in members.values():
            yield person_obj
Пример #8
0
    def scrape(self):
        """Yield NYC committee Organizations, then every Person seen.

        Council members are built from the 'City Council' office records and
        enriched with the Legistar web listing. Committee-like bodies are
        yielded with their memberships attached to people, and all people
        are yielded at the end.
        """
        web_scraper = LegistarPersonScraper(requests_per_minute=self.requests_per_minute)
        web_scraper.MEMBERLIST = 'http://legistar.council.nyc.gov/DepartmentDetail.aspx?ID=6897&GUID=CDC6E691-8A8C-4F25-97CB-86F31EDAB081&Mode=MainBody'

        if self.cache_storage:
            web_scraper.cache_storage = self.cache_storage

        if self.requests_per_minute == 0:
            web_scraper.cache_write_only = False

        web_info = {}

        for member, _ in web_scraper.councilMembers():
            name = member['Person Name']['label'].strip()
            web_info[name] = member

        # Exactly one body must match; the unpacking raises otherwise.
        city_council, = [body for body in self.bodies()
                         if body['BodyName'] == 'City Council']

        terms = collections.defaultdict(list)

        public_advocates = {  # Match casing to Bill De Blasio as council member
            'The Public Advocate (Mr. de Blasio)': 'Bill De Blasio',
            'The Public Advocate (Ms. James)': 'Letitia James',
        }

        for office in self.body_offices(city_council):
            name = office['OfficeRecordFullName']
            name = public_advocates.get(name, name).strip()

            terms[name].append(office)

            # Add past members (and public advocates)
            if name not in web_info:
                web_info[name] = collections.defaultdict(lambda: None)

        # Check that we have everyone we expect, formatted consistently, in
        # both information arrays. For instance, this will fail if we forget to
        # strip trailing spaces from names on one side or the other (which has
        # the effect of omitting information, such as post, from the scrape).

        assert set(web_info.keys()) == set(terms.keys())

        # Maps the web-page column name to (contact type, note).
        contact_types = {
            "City Hall Office": ("address", "City Hall Office"),
            "City Hall Phone": ("voice", "City Hall Phone"),
            "Ward Office Phone": ("voice", "Ward Office Phone"),
            "Ward Office Address": ("address", "Ward Office Address"),
            "Fax": ("fax", "Fax")
        }

        members = {}

        for member, offices in terms.items():

            p = Person(member)

            web = web_info[member]

            # NOTE(review): party, contact details and links below are added
            # once per term, so multi-term members may get repeated entries
            # -- confirm whether that is intended.
            for term in offices:
                role = term['OfficeRecordTitle']

                if role == 'Public Advocate':
                    role = 'Non-Voting Council Member'
                else:
                    role = 'Council Member'

                district = web.get('District', '').replace(' 0', ' ')

                p.add_term(role,
                           'legislature',
                           district=district,
                           start_date=self.toDate(term['OfficeRecordStartDate']),
                           end_date=self.toDate(term['OfficeRecordEndDate']))

                party = web.get('Political Party')

                if party == 'Democrat':
                    party = 'Democratic'

                if party:
                    p.add_party(party)

                if web.get('Photo'):
                    p.image = web['Photo']

                for contact_type, (type_, _note) in contact_types.items():
                    # Look the value up once; `web` is a mapping and must be
                    # read with .get / [], never called.
                    value = web.get(contact_type)
                    if value and value != 'N/A':
                        p.add_contact_detail(type=type_,
                                             value=value,
                                             note=_note)

                if web.get('E-mail'):
                    p.add_contact_detail(type="email",
                                         value=web['E-mail']['url'],
                                         note='E-mail')

                if web.get('Web site'):
                    p.add_link(web['Web site']['url'], note='web site')

                if web.get('Notes'):
                    p.extras = {'Notes': web['Notes']}

                if not p.sources:  # Only add sources once
                    source_urls = self.person_sources_from_office(term)
                    person_api_url, person_web_url = source_urls
                    p.add_source(person_api_url, note='api')
                    p.add_source(person_web_url, note='web')

            members[member] = p

        committee_types = ['Committee',
                           'Inactive Committee',
                           'Select Committee',
                           'Subcommittee',
                           'Task Force',
                           'Land Use', # Committee on Land Use
                          ]

        # Keep only the body types that count as committees.
        body_types = {k: v for k, v in self.body_types().items()
                      if k in committee_types}

        for body in self.bodies():
            if body['BodyTypeName'] in body_types \
                or body['BodyName'] in ('Legislative Documents Unit',
                                        'Legal and Government Affairs Division'):

                # Skip typo in API data
                if body['BodyName'] == 'Committee on Mental Health, Developmental Disability, Alcoholism, Substance Abuse amd Disability Services':
                    continue

                parent_org = PARENT_ORGS.get(body['BodyName'], 'New York City Council')

                body_name = body['BodyName']

                o = Organization(body_name,
                                 classification='committee',
                                 parent_id={'name': parent_org})

                o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body), note='api')
                o.add_source(self.WEB_URL + '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'.format(**body), note='web')

                for office in self.body_offices(body):
                    # Possible roles: 'Council Member', 'MEMBER', 'Ex-Officio',
                    # 'Committee Member', None, 'CHAIRPERSON'

                    role = office['OfficeRecordTitle']

                    if role and role.lower() == 'chairperson':
                        role = 'Chairperson'
                    else:
                        role = 'Member'

                    person = office['OfficeRecordFullName']
                    person = public_advocates.get(person, person).strip()

                    if person in members:
                        p = members[person]
                    else:
                        # Committee member without a council office record.
                        p = Person(person)

                        source_urls = self.person_sources_from_office(office)
                        person_api_url, person_web_url = source_urls
                        p.add_source(person_api_url, note='api')
                        p.add_source(person_web_url, note='web')

                        members[person] = p

                    p.add_membership(o,
                                     role=role,
                                     start_date=self.toDate(office['OfficeRecordStartDate']),
                                     end_date=self.toDate(office['OfficeRecordEndDate']))

                yield o

        # People go last so they carry all committee memberships.
        for p in members.values():
            yield p
Пример #9
0
    def scrape(self):
        """Yield New York City Council committees followed by all people.

        People are created from the office records of the 'City Council'
        body, merged by name with the Legistar web listing. Each qualifying
        body becomes a committee Organization whose office holders receive a
        membership; the Person objects are yielded after all organizations.
        """
        web_scraper = LegistarPersonScraper(
            requests_per_minute=self.requests_per_minute)
        web_scraper.MEMBERLIST = 'http://legistar.council.nyc.gov/DepartmentDetail.aspx?ID=6897&GUID=CDC6E691-8A8C-4F25-97CB-86F31EDAB081&Mode=MainBody'

        if self.cache_storage:
            web_scraper.cache_storage = self.cache_storage

        if self.requests_per_minute == 0:
            web_scraper.cache_write_only = False

        web_info = {}

        for member, _ in web_scraper.councilMembers():
            name = member['Person Name']['label'].strip()
            web_info[name] = member

        # Unpacking enforces that exactly one body carries this name.
        city_council, = [
            body for body in self.bodies()
            if body['BodyName'] == 'City Council'
        ]

        terms = collections.defaultdict(list)

        public_advocates = {  # Match casing to Bill De Blasio as council member
            'The Public Advocate (Mr. de Blasio)': 'Bill De Blasio',
            'The Public Advocate (Ms. James)': 'Letitia James',
        }

        for office in self.body_offices(city_council):
            name = office['OfficeRecordFullName']
            name = public_advocates.get(name, name).strip()

            terms[name].append(office)

            # Add past members (and public advocates)
            if name not in web_info:
                web_info[name] = collections.defaultdict(lambda: None)

        # Check that we have everyone we expect, formatted consistently, in
        # both information arrays. For instance, this will fail if we forget to
        # strip trailing spaces from names on one side or the other (which has
        # the effect of omitting information, such as post, from the scrape).

        assert set(web_info.keys()) == set(terms.keys())

        # Web-page column name -> (contact type, note).
        contact_types = {
            "City Hall Office": ("address", "City Hall Office"),
            "City Hall Phone": ("voice", "City Hall Phone"),
            "Ward Office Phone": ("voice", "Ward Office Phone"),
            "Ward Office Address": ("address", "Ward Office Address"),
            "Fax": ("fax", "Fax")
        }

        members = {}

        for member, offices in terms.items():

            p = Person(member)

            web = web_info[member]

            # NOTE(review): everything after add_term in this loop depends
            # only on `web`, yet runs once per term -- confirm that repeated
            # party / contact / link entries are acceptable.
            for term in offices:
                role = term['OfficeRecordTitle']

                if role == 'Public Advocate':
                    role = 'Non-Voting Council Member'
                else:
                    role = 'Council Member'

                district = web.get('District', '').replace(' 0', ' ')

                p.add_term(role,
                           'legislature',
                           district=district,
                           start_date=self.toDate(
                               term['OfficeRecordStartDate']),
                           end_date=self.toDate(term['OfficeRecordEndDate']))

                party = web.get('Political Party')

                if party == 'Democrat':
                    party = 'Democratic'

                if party:
                    p.add_party(party)

                if web.get('Photo'):
                    p.image = web['Photo']

                for contact_type, (type_, _note) in contact_types.items():
                    # `web` is a mapping: read it with .get, do not call it.
                    contact_value = web.get(contact_type)
                    if contact_value and contact_value != 'N/A':
                        p.add_contact_detail(type=type_,
                                             value=contact_value,
                                             note=_note)

                if web.get('E-mail'):
                    p.add_contact_detail(type="email",
                                         value=web['E-mail']['url'],
                                         note='E-mail')

                if web.get('Web site'):
                    p.add_link(web['Web site']['url'], note='web site')

                if web.get('Notes'):
                    p.extras = {'Notes': web['Notes']}

                if not p.sources:  # Only add sources once
                    source_urls = self.person_sources_from_office(term)
                    person_api_url, person_web_url = source_urls
                    p.add_source(person_api_url, note='api')
                    p.add_source(person_web_url, note='web')

            members[member] = p

        committee_types = [
            'Committee', 'Inactive Committee', 'Select Committee',
            'Subcommittee', 'Task Force', 'Land Use'
        ]  # Committee on Land Use

        # Restrict the API's body types to the committee-like ones.
        body_types = {
            k: v
            for k, v in self.body_types().items() if k in committee_types
        }

        for body in self.bodies():
            if body['BodyTypeName'] in body_types \
                or body['BodyName'] in ('Legislative Documents Unit',
                                        'Legal and Government Affairs Division'):

                # Skip typo in API data
                if body['BodyName'] == 'Committee on Mental Health, Developmental Disability, Alcoholism, Substance Abuse amd Disability Services':
                    continue

                parent_org = PARENT_ORGS.get(body['BodyName'],
                                             'New York City Council')

                body_name = body['BodyName']

                o = Organization(body_name,
                                 classification='committee',
                                 parent_id={'name': parent_org})

                o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body),
                             note='api')
                o.add_source(
                    self.WEB_URL +
                    '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'.
                    format(**body),
                    note='web')

                for office in self.body_offices(body):
                    # Possible roles: 'Council Member', 'MEMBER', 'Ex-Officio',
                    # 'Committee Member', None, 'CHAIRPERSON'

                    role = office['OfficeRecordTitle']

                    if role and role.lower() == 'chairperson':
                        role = 'Chairperson'
                    else:
                        role = 'Member'

                    person = office['OfficeRecordFullName']
                    person = public_advocates.get(person, person).strip()

                    if person in members:
                        p = members[person]
                    else:
                        # Office holder with no 'City Council' office record.
                        p = Person(person)

                        source_urls = self.person_sources_from_office(office)
                        person_api_url, person_web_url = source_urls
                        p.add_source(person_api_url, note='api')
                        p.add_source(person_web_url, note='web')

                        members[person] = p

                    p.add_membership(o,
                                     role=role,
                                     start_date=self.toDate(
                                         office['OfficeRecordStartDate']),
                                     end_date=self.toDate(
                                         office['OfficeRecordEndDate']))

                yield o

        # Yield people last, once all memberships have been attached.
        for p in members.values():
            yield p
Пример #10
0
    def scrape(self):
        """
        Yield Pittsburgh committee Organizations, then council member Persons.

        Office records of the body named "City Council" are grouped by member
        name; each group becomes a ``Person`` with one "Councilmember" term
        per record, plus the e-mail from the Legistar web listing (when the
        member is listed there) and the address / phone / website found in
        the person API response.  Every body whose type is "Committee" is
        then yielded as an ``Organization`` once its office records have been
        turned into memberships.  The council members are yielded last.

        Raises ValueError (from the one-element unpacking) unless exactly one
        body is named "City Council".
        """
        body_types = self.body_types()
        city_council, = [body for body in self.bodies()
                         if body["BodyName"] == "City Council"]

        # Group council office records by (stripped) member name, skipping
        # records whose name contains "VACAN" (presumably vacant seats).
        terms = collections.defaultdict(list)
        for office in self.body_offices(city_council):
            if "VACAN" not in office["OfficeRecordFullName"]:
                terms[office["OfficeRecordFullName"].strip()].append(office)

        web_scraper = LegistarPersonScraper(requests_per_minute=self.requests_per_minute)
        web_scraper.MEMBERLIST = "https://pittsburgh.legistar.com/People.aspx"
        web_scraper.COMMITTEELIST = "https://pittsburgh.legistar.com/Departments.aspx"

        # Mirror this scraper's cache configuration on the web scraper.
        if self.cache_storage:
            web_scraper.cache_storage = self.cache_storage

        if self.requests_per_minute == 0:
            web_scraper.cache_write_only = False

        web_info = {}
        for member in web_scraper.councilMembers():
            web_info[member["Person Name"]] = member

        members = {}
        for member, offices in terms.items():
            person = Person(member)
            for term in offices:
                person.add_term("Councilmember",
                                "legislature",
                                start_date=self.toDate(term["OfficeRecordStartDate"]),
                                end_date=self.toDate(term["OfficeRecordEndDate"]))

            if member in web_info:
                web = web_info[member]
                if web["E-mail"] and web["E-mail"]["label"] and web["E-mail"]["label"] != "N/A":
                    person.add_contact_detail(type="email",
                                              value=web["E-mail"]["label"],
                                              note="E-mail")

            # The person-level data is looked up through the member's last
            # office record (every group holds at least one record).
            person_source_data = self.person_sources_from_office(offices[-1])
            person_api_url, person_api_response = person_source_data
            person.add_source(person_api_url, note="api")

            if person_api_response["PersonAddress1"]:
                address = (person_api_response["PersonAddress1"] + ", "
                           + person_api_response["PersonCity1"] + ", "
                           + person_api_response["PersonState1"] + " "
                           + person_api_response["PersonZip1"])
                person.add_contact_detail(type="address",
                                          value=address,
                                          note="Office address")

            if person_api_response["PersonPhone"]:
                person.add_contact_detail(type="voice",
                                          value=person_api_response["PersonPhone"],
                                          note="Office phone")

            if person_api_response["PersonWWW"]:
                person.add_contact_detail(type="url",
                                          value=person_api_response["PersonWWW"],
                                          note="District website")

            members[member] = person

        for body in self.bodies():
            if body["BodyTypeId"] == body_types["Committee"]:
                body_name_clean = body["BodyName"].strip()
                organization = Organization(body_name_clean,
                                            classification="committee",
                                            parent_id={"name": "Pittsburgh City Council"})

                organization.add_source(self.BASE_URL + "/bodies/{BodyId}".format(**body), note="api")

                for office in self.body_offices(body):
                    # Anything other than the two chair roles is a plain
                    # member (this already covers "Councilmember").
                    role = office["OfficeRecordMemberType"]
                    if role not in ("Vice Chair", "Chair"):
                        role = "Member"

                    person_name = office["OfficeRecordFullName"].strip()
                    if person_name in members:
                        person = members[person_name]
                    else:
                        # NOTE(review): this Person is never stored in
                        # `members`, so it is never yielded and the
                        # membership added below is discarded.  Confirm
                        # whether committee-only people should be emitted.
                        person = Person(person_name)

                    person.add_membership(body_name_clean,
                                          role=role,
                                          start_date=self.toDate(office["OfficeRecordStartDate"]),
                                          end_date=self.toDate(office["OfficeRecordEndDate"]))

                yield organization

        for person in members.values():
            yield person
Пример #11
0
    def scrape(self):
        '''
        Yield Chicago committee Organizations, then every collected Person.

        Office records of the body named "City Council" are grouped by member
        name; each group becomes a ``Person`` with one "Alderman" term per
        record, using the ward number, photo, contact details and website
        taken from the Legistar web listing.  Bodies typed "Committee" are
        yielded with their memberships attached, followed by bodies typed
        "Joint Committee" (sources only, no memberships).  All people,
        including committee members who hold no council office, are yielded
        last.

        Raises ValueError (from the one-element unpacking) unless exactly one
        body is named "City Council", and KeyError when a council member has
        no entry in the web listing.
        '''
        body_types = self.body_types()

        city_council, = [body for body in self.bodies()
                         if body['BodyName'] == 'City Council']

        # Group council office records by (stripped) member name, skipping
        # records whose name contains "vacan" in any letter case.
        terms = collections.defaultdict(list)
        for office in self.body_offices(city_council):
            if 'vacan' not in office['OfficeRecordFullName'].lower():
                terms[office['OfficeRecordFullName'].strip()].append(office)

        web_scraper = LegistarPersonScraper(requests_per_minute=self.requests_per_minute)
        web_scraper.MEMBERLIST = 'https://chicago.legistar.com/DepartmentDetail.aspx?ID=12357&GUID=4B24D5A9-FED0-4015-9154-6BFFFB2A8CB4&R=8bcbe788-98cd-4040-9086-b34fa8e49881'
        web_scraper.ALL_MEMBERS = '3:3'

        # Mirror this scraper's cache configuration on the web scraper.
        if self.cache_storage:
            web_scraper.cache_storage = self.cache_storage

        if self.requests_per_minute == 0:
            web_scraper.cache_write_only = False

        web_info = {}
        for member, _ in web_scraper.councilMembers({'ctl00$ContentPlaceHolder$lstName': 'City Council'}):
            web_info[member['Person Name']['label']] = member

        # Hard-coded stand-ins for two members: only the ward is known, and
        # every other key resolves to None so the contact lookups below are
        # skipped for them.
        web_info['Balcer, James'] = collections.defaultdict(lambda: None)
        web_info['Fioretti, Bob'] = collections.defaultdict(lambda: None)
        web_info['Balcer, James']['Ward/Office'] = 11
        web_info['Fioretti, Bob']['Ward/Office'] = 2

        # Web listing column -> (contact detail type, note).  Built once
        # because it is identical for every member.
        contact_types = {
            "City Hall Address": ("address", "City Hall Address"),
            "City Hall Phone": ("voice", "City Hall Phone"),
            "Ward Office Phone": ("voice", "Ward Office Phone"),
            "Ward Office Address": ("address", "Ward Office Address"),
            "Fax": ("fax", "Fax"),
        }

        members = {}
        for member, offices in terms.items():
            web = web_info[member]
            p = Person(member)

            # The ward is a property of the member, not of a single term.
            district = "Ward {}".format(int(web['Ward/Office']))
            for term in offices:
                p.add_term('Alderman',
                           'legislature',
                           district=district,
                           start_date=self.toDate(term['OfficeRecordStartDate']),
                           end_date=self.toDate(term['OfficeRecordEndDate']))

            if web.get('Photo'):
                p.image = web['Photo']

            for contact_type, (type_, _note) in contact_types.items():
                if web[contact_type] and web[contact_type] != 'N/A':
                    p.add_contact_detail(type=type_,
                                         value=web[contact_type],
                                         note=_note)

            if web["E-mail"] and web["E-mail"]["label"] and web["E-mail"]["label"] != 'N/A':
                p.add_contact_detail(type="email",
                                     value=web['E-mail']['label'],
                                     note='E-mail')

            if web['Website']:
                p.add_link(web['Website']['url'])

            # Person-level sources are derived from the member's last office
            # record (every group holds at least one record).
            source_urls = self.person_sources_from_office(offices[-1])
            person_api_url, person_web_url = source_urls
            p.add_source(person_api_url, note='api')
            p.add_source(person_web_url, note='web')

            members[member] = p

        for body in self.bodies():
            if body['BodyTypeId'] == body_types['Committee']:
                o = Organization(body['BodyName'],
                                 classification='committee',
                                 parent_id={'name': 'Chicago City Council'})

                o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body), note='api')
                o.add_source(self.WEB_URL + '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'.format(**body), note='web')

                for office in self.body_offices(body):
                    # messed up record for joanna thompson
                    if office['OfficeRecordId'] == 1055:
                        continue

                    role = office['OfficeRecordTitle']
                    if role not in ("Vice Chair", "Chairman"):
                        role = 'Member'

                    person = office['OfficeRecordFullName'].strip()
                    if person in members:
                        p = members[person]
                    else:
                        # Committee member without a council office: create
                        # the Person here and remember it for later records.
                        p = Person(person)

                        source_urls = self.person_sources_from_office(office)
                        person_api_url, person_web_url = source_urls
                        p.add_source(person_api_url, note='api')
                        p.add_source(person_web_url, note='web')

                        members[person] = p

                    # toDate raising TypeError is treated as "no end date".
                    try:
                        end_date = self.toDate(office['OfficeRecordEndDate'])
                    except TypeError:
                        end_date = ''
                    p.add_membership(body['BodyName'],
                                     role=role,
                                     start_date=self.toDate(office['OfficeRecordStartDate']),
                                     end_date=end_date)

                yield o

        for body in self.bodies():
            if body['BodyTypeId'] == body_types['Joint Committee']:
                o = Organization(body['BodyName'],
                                 classification='committee',
                                 parent_id={'name': 'Chicago City Council'})

                o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body), note='api')
                o.add_source(self.WEB_URL + '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'.format(**body), note='web')

                yield o

        for p in members.values():
            yield p
Пример #12
0
    def scrape(self):
        '''
        Yield Metro committee Organizations, then every collected Person.

        The Legistar web listing is scraped first to build a mapping from
        organization name to its "Department Name" entry, which supplies the
        web URL of each committee's detail page.  Office records of the
        board of directors are grouped by member name and turned into
        ``Person`` objects with their terms; every body typed "Committee" is
        then yielded as an ``Organization`` with memberships attached, and
        all people are yielded last.

        Raises ValueError (from the one-element unpacking) unless exactly one
        body is named 'Board of Directors - Regular Board Meeting'.
        '''
        web_scraper = LegistarPersonScraper(requests_per_minute=self.requests_per_minute)
        web_scraper.MEMBERLIST = 'https://metro.legistar.com/People.aspx'
        web_info = {}

        for _, organizations in web_scraper.councilMembers():
            for organization, _, _ in organizations:
                organization_name = organization['Department Name']['label'].strip()
                organization_info = organization['Department Name']

                web_info[organization_name] = organization_info

        body_types = self.body_types()

        board_of_directors, = [body for body in self.bodies()
                               if body['BodyName'] == 'Board of Directors - Regular Board Meeting']
        board_of_directors["BodyName"] = "Board of Directors"

        terms = collections.defaultdict(list)
        for office in self.body_offices(board_of_directors):
            terms[office['OfficeRecordFullName']].append(office)

        members = {}
        for member, offices in terms.items():
            p = Person(member)
            for term in offices:
                role = term['OfficeRecordTitle']

                # Titles other than the two plain membership titles are
                # recorded as a term of their own.
                if role not in {'Board Member', 'non-voting member'}:
                    p.add_term(role,
                               'legislature',
                               start_date=self.toDate(term['OfficeRecordStartDate']),
                               end_date=self.toDate(term['OfficeRecordEndDate']),
                               appointment=True)

                # Everyone but the CEO also gets a (non)voting board term,
                # with the post looked up by member name.
                if role != 'Chief Executive Officer':
                    if role == 'non-voting member':
                        member_type = 'Nonvoting Board Member'
                        post = NONVOTING_POSTS.get(member)
                    else:
                        member_type = 'Board Member'
                        post = VOTING_POSTS.get(member)

                    start_date = self.toDate(term['OfficeRecordStartDate'])
                    end_date = self.toDate(term['OfficeRecordEndDate'])
                    board_membership = p.add_term(member_type,
                                                  'legislature',
                                                  district=post,
                                                  start_date=start_date,
                                                  end_date=end_date)

                    acting_member_end_date = ACTING_MEMBERS_WITH_END_DATE.get(p.name)
                    if acting_member_end_date and acting_member_end_date <= end_date:
                        board_membership.extras = {'acting': 'true'}

            # Person-level sources are derived from the member's last office
            # record (every group holds at least one record).
            source_urls = self.person_sources_from_office(offices[-1])
            person_api_url, person_web_url = source_urls
            p.add_source(person_api_url, note='api')
            p.add_source(person_web_url, note='web')

            members[member] = p

        for body in self.bodies():
            if body['BodyTypeId'] == body_types['Committee']:
                organization_name = body['BodyName'].strip()
                o = Organization(organization_name,
                                 classification='committee',
                                 parent_id={'name': 'Board of Directors'})

                organization_info = web_info.get(organization_name, {})
                # The fallback is already an absolute URL; prefixing it with
                # self.WEB_URL would produce a malformed source URL.
                organization_url = organization_info.get(
                    'url', 'https://metro.legistar.com/Departments.aspx')

                o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body), note='api')
                o.add_source(organization_url, note='web')

                for office in self.body_offices(body):
                    role = office['OfficeRecordTitle']

                    if role not in ("Chair", "Vice Chair", "Chief Executive Officer"):
                        if role == 'non-voting member':
                            role = 'Nonvoting Member'
                        else:
                            role = 'Member'

                    person = office['OfficeRecordFullName']

                    if person in members:
                        p = members[person]
                    else:
                        # Committee member without a board office: create
                        # the Person here and remember it for later records.
                        p = Person(person)

                        source_urls = self.person_sources_from_office(office)
                        person_api_url, person_web_url = source_urls
                        p.add_source(person_api_url, note='api')
                        p.add_source(person_web_url, note='web')

                        members[person] = p

                    start_date = self.toDate(office['OfficeRecordStartDate'])
                    end_date = self.toDate(office['OfficeRecordEndDate'])
                    membership = p.add_membership(organization_name,
                                                  role=role,
                                                  start_date=start_date,
                                                  end_date=end_date)

                    acting_member_end_date = ACTING_MEMBERS_WITH_END_DATE.get(p.name)
                    if acting_member_end_date and acting_member_end_date <= end_date:
                        membership.extras = {'acting': 'true'}

                yield o

        for p in members.values():
            yield p
Пример #13
0
    def scrape(self):
        '''
        Yield Metro committee Organizations, then every collected Person.

        The Legistar web listing is scraped first to build a mapping from
        organization name to its "Department Name" entry, which supplies the
        web URL of each committee's detail page.  Office records of the
        board of directors are grouped by member name and turned into
        ``Person`` objects with their terms; every body typed "Committee" is
        then yielded as an ``Organization`` with memberships attached, and
        all people are yielded last.

        Raises ValueError (from the one-element unpacking) unless exactly one
        body is named 'Board of Directors - Regular Board Meeting'.
        '''
        web_scraper = LegistarPersonScraper(
            requests_per_minute=self.requests_per_minute)
        web_scraper.MEMBERLIST = 'https://metro.legistar.com/People.aspx'
        web_info = {}

        for _, organizations in web_scraper.councilMembers():
            for organization, _, _ in organizations:
                organization_name = organization['Department Name'][
                    'label'].strip()
                organization_info = organization['Department Name']

                web_info[organization_name] = organization_info

        body_types = self.body_types()

        board_of_directors, = [
            body for body in self.bodies()
            if body['BodyName'] == 'Board of Directors - Regular Board Meeting'
        ]
        board_of_directors["BodyName"] = "Board of Directors"

        terms = collections.defaultdict(list)
        for office in self.body_offices(board_of_directors):
            terms[office['OfficeRecordFullName']].append(office)

        members = {}
        for member, offices in terms.items():
            p = Person(member)
            for term in offices:
                role = term['OfficeRecordTitle']

                # Titles other than the two plain membership titles are
                # recorded as a term of their own.
                if role not in {'Board Member', 'non-voting member'}:
                    p.add_term(
                        role,
                        'legislature',
                        start_date=self.toDate(term['OfficeRecordStartDate']),
                        end_date=self.toDate(term['OfficeRecordEndDate']),
                        appointment=True)

                # Everyone but the CEO also gets a (non)voting board term,
                # with the post looked up by member name.
                if role != 'Chief Executive Officer':
                    if role == 'non-voting member':
                        member_type = 'Nonvoting Board Member'
                        post = NONVOTING_POSTS.get(member)
                    else:
                        member_type = 'Board Member'
                        post = VOTING_POSTS.get(member)

                    start_date = self.toDate(term['OfficeRecordStartDate'])
                    end_date = self.toDate(term['OfficeRecordEndDate'])
                    board_membership = p.add_term(member_type,
                                                  'legislature',
                                                  district=post,
                                                  start_date=start_date,
                                                  end_date=end_date)

                    acting_member_end_date = ACTING_MEMBERS_WITH_END_DATE.get(
                        p.name)
                    if acting_member_end_date and acting_member_end_date <= end_date:
                        board_membership.extras = {'acting': 'true'}

            # Person-level sources are derived from the member's last office
            # record (every group holds at least one record).
            source_urls = self.person_sources_from_office(offices[-1])
            person_api_url, person_web_url = source_urls
            p.add_source(person_api_url, note='api')
            p.add_source(person_web_url, note='web')

            members[member] = p

        for body in self.bodies():
            if body['BodyTypeId'] == body_types['Committee']:
                organization_name = body['BodyName'].strip()
                o = Organization(organization_name,
                                 classification='committee',
                                 parent_id={'name': 'Board of Directors'})

                organization_info = web_info.get(organization_name, {})
                # The fallback is already an absolute URL; prefixing it with
                # self.WEB_URL would produce a malformed source URL.
                organization_url = organization_info.get(
                    'url', 'https://metro.legistar.com/Departments.aspx')

                o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body),
                             note='api')
                o.add_source(organization_url, note='web')

                for office in self.body_offices(body):
                    role = office['OfficeRecordTitle']

                    if role not in ("Chair", "Vice Chair",
                                    "Chief Executive Officer"):
                        if role == 'non-voting member':
                            role = 'Nonvoting Member'
                        else:
                            role = 'Member'

                    person = office['OfficeRecordFullName']

                    if person in members:
                        p = members[person]
                    else:
                        # Committee member without a board office: create
                        # the Person here and remember it for later records.
                        p = Person(person)

                        source_urls = self.person_sources_from_office(office)
                        person_api_url, person_web_url = source_urls
                        p.add_source(person_api_url, note='api')
                        p.add_source(person_web_url, note='web')

                        members[person] = p

                    start_date = self.toDate(office['OfficeRecordStartDate'])
                    end_date = self.toDate(office['OfficeRecordEndDate'])
                    membership = p.add_membership(organization_name,
                                                  role=role,
                                                  start_date=start_date,
                                                  end_date=end_date)

                    acting_member_end_date = ACTING_MEMBERS_WITH_END_DATE.get(
                        p.name)
                    if acting_member_end_date and acting_member_end_date <= end_date:
                        membership.extras = {'acting': 'true'}

                yield o

        for p in members.values():
            yield p