def refresh_people(jurisdiction):
    s = LegistarPersonScraper()
    MEMBERLIST = 'https://{}.legistar.com/People.aspx'.format(jurisdiction)

    page = next(s.pages(MEMBERLIST))

    save_page(page, jurisdiction, 'people.html')
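# save_page() is not defined in this excerpt. A minimal sketch of what it is
# assumed to do (serialize the fetched lxml page into the fixture directory
# read by the test below); the fixture_root default is an assumption:
import os

import lxml.html


def save_page(page, jurisdiction, filename, fixture_root='tests/fixtures'):
    target_dir = os.path.join(fixture_root, jurisdiction)
    os.makedirs(target_dir, exist_ok=True)

    with open(os.path.join(target_dir, filename), 'wb') as f:
        f.write(lxml.html.tostring(page))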
def test_parse_people(project_directory, mocker, jurisdiction):
    events_fixture = os.path.join(project_directory, 'tests', 'fixtures',
                                  jurisdiction, 'people.html')

    scraper = LegistarPersonScraper()
    scraper.BASE_URL = '{}.legistar.com'.format(jurisdiction)

    with open(events_fixture, 'r') as f:
        page = lxml.html.fromstring(f.read())

    # councilMembers() iterates over pages(), so the patch should return an
    # iterable of pages rather than the bare lxml element.
    mocker.patch.object(scraper, 'pages', return_value=iter([page]))

    result = next(scraper.councilMembers(follow_links=False))

    print(result)
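# Usage sketch (assumptions: pytest with pytest-mock installed, and a
# project_directory fixture pointing at the repository root). refresh_people()
# records a live People.aspx page as tests/fixtures/<jurisdiction>/people.html,
# and test_parse_people() then parses that fixture offline by patching
# scraper.pages, so the test never hits the network:
#
#     refresh_people('metro')
#     pytest -k test_parse_people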
def scrape(self):
    '''
    Scrape the web to create a dict with all active organizations. Then, we
    can access the correct URL for the organization detail page.
    '''
    web_scraper = LegistarPersonScraper(
        requests_per_minute=self.requests_per_minute)
    web_scraper.MEMBERLIST = 'https://metro.legistar.com/People.aspx'

    web_info = {}

    for _, organizations in web_scraper.councilMembers():
        for organization, _, _ in organizations:
            organization_name = organization['Department Name']['label'].strip()
            organization_info = organization['Department Name']

            web_info[organization_name] = organization_info

    body_types = self.body_types()

    board_of_directors, = [
        body for body in self.bodies()
        if body['BodyName'] == 'Board of Directors - Regular Board Meeting'
    ]
    board_of_directors["BodyName"] = "Board of Directors"

    terms = collections.defaultdict(list)
    for office in self.body_offices(board_of_directors):
        terms[office['OfficeRecordFullName']].append(office)

    members = {}
    for member, offices in terms.items():
        p = Person(member)

        for term in offices:
            role = term['OfficeRecordTitle']

            if role not in {'Board Member', 'non-voting member'}:
                p.add_term(role,
                           'legislature',
                           start_date=self.toDate(term['OfficeRecordStartDate']),
                           end_date=self.toDate(term['OfficeRecordEndDate']),
                           appointment=True)

            if role != 'Chief Executive Officer':
                if role == 'non-voting member':
                    member_type = 'Nonvoting Board Member'
                    post = NONVOTING_POSTS.get(member)
                else:
                    member_type = 'Board Member'
                    post = VOTING_POSTS.get(member)

                start_date = self.toDate(term['OfficeRecordStartDate'])
                end_date = self.toDate(term['OfficeRecordEndDate'])

                board_membership = p.add_term(member_type,
                                              'legislature',
                                              district=post,
                                              start_date=start_date,
                                              end_date=end_date)

                acting_member_end_date = ACTING_MEMBERS_WITH_END_DATE.get(p.name)

                if acting_member_end_date and acting_member_end_date <= end_date:
                    board_membership.extras = {'acting': 'true'}

            # Each term contains first and last names. This should be the same
            # across all of a person's terms, so go ahead and grab them from
            # the last term in the array.
            p.family_name = term['OfficeRecordLastName']
            p.given_name = term['OfficeRecordFirstName']

        # Defensively assert that the given and family names match the
        # expected value.
        if member == 'Hilda L. Solis':
            # Given/family name does not contain middle initial.
            assert p.given_name == 'Hilda' and p.family_name == 'Solis'
        else:
            assert member == ' '.join([p.given_name, p.family_name])

        source_urls = self.person_sources_from_office(term)
        person_api_url, person_web_url = source_urls
        p.add_source(person_api_url, note='api')
        p.add_source(person_web_url, note='web')

        members[member] = p

    for body in self.bodies():
        if body['BodyTypeId'] in (body_types['Committee'],
                                  body_types['Independent Taxpayer Oversight Committee']):
            organization_name = body['BodyName'].strip()

            o = Organization(organization_name,
                             classification='committee',
                             parent_id={'name': 'Board of Directors'})

            organization_info = web_info.get(organization_name, {})
            # The fallback is already an absolute URL, so it should not be
            # prefixed with self.WEB_URL.
            organization_url = organization_info.get(
                'url', 'https://metro.legistar.com/Departments.aspx')

            o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body),
                         note='api')
            o.add_source(organization_url, note='web')

            for office in self.body_offices(body):
                role = office['OfficeRecordTitle']

                if role not in BOARD_OFFICE_ROLES:
                    if role == 'non-voting member':
                        role = 'Nonvoting Member'
                    else:
                        role = 'Member'

                person = office['OfficeRecordFullName']

                if person in members:
                    p = members[person]
                else:
                    p = Person(person)

                    source_urls = self.person_sources_from_office(office)
                    person_api_url, person_web_url = source_urls
                    p.add_source(person_api_url, note='api')
                    p.add_source(person_web_url, note='web')

                    members[person] = p

                start_date = self.toDate(office['OfficeRecordStartDate'])
                end_date = self.toDate(office['OfficeRecordEndDate'])

                membership = p.add_membership(organization_name,
                                              role=role,
                                              start_date=start_date,
                                              end_date=end_date)

                acting_member_end_date = ACTING_MEMBERS_WITH_END_DATE.get(p.name)

                if acting_member_end_date and acting_member_end_date <= end_date:
                    membership.extras = {'acting': 'true'}

            yield o

    for p in members.values():
        yield p
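# toDate() is used throughout these scrapers but not defined in the excerpts.
# A minimal sketch of the assumed behavior (normalize a Legistar API timestamp
# such as '2017-06-30T00:00:00' into an ISO-8601 date string, passing empty
# values through); this is an illustration, not the scrapers' actual helper:
from datetime import datetime


def to_date(text):
    if not text:
        return ''
    return datetime.strptime(text, '%Y-%m-%dT%H:%M:%S').date().isoformat()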
def scrape(self):
    body_types = self.body_types()

    city_council, = [
        body for body in self.bodies()
        if body['BodyName'] == 'COMMON COUNCIL'
    ]

    terms = collections.defaultdict(list)
    for office in self.body_offices(city_council):
        terms[office['OfficeRecordFullName'].strip()].append(office)

    web_scraper = LegistarPersonScraper(
        requests_per_minute=self.requests_per_minute)
    web_scraper.MEMBERLIST = 'https://milwaukee.legistar.com/DepartmentDetail.aspx?ID=1998&GUID=74273156-5389-46F3-9D09-3D850BDE32A1'
    # web_scraper.ALL_MEMBERS = '3:3'

    if self.cache_storage:
        web_scraper.cache_storage = self.cache_storage

    if self.requests_per_minute == 0:
        web_scraper.cache_write_only = False

    web_info = {}
    for member, _ in web_scraper.councilMembers(
            {'ctl00$ContentPlaceHolder$lstName': 'COMMON COUNCIL'}):
        web_info[member['Person Name']['label']] = member

    members = {}
    for member, offices in terms.items():
        web = web_info[member]
        p = Person(member)

        for term in offices:
            role = term['OfficeRecordTitle']
            district = re.search(r'(?<=(/{1}district))[\w]+',
                                 web['Website']['url']).group(0)
            p.add_term('Alderman',
                       'legislature',
                       district="District {}".format(int(district)),
                       start_date=self.toDate(term['OfficeRecordStartDate']),
                       end_date=self.toDate(term['OfficeRecordEndDate']))

        if web.get('Photo'):
            p.image = web['Photo']

        contact_types = {
            "City Hall Address": ("address", "City Hall Address"),
            "City Hall Phone": ("voice", "City Hall Phone"),
            "District Office Phone": ("voice", "District Office Phone"),
            "District Office Address": ("address", "District Office Address"),
            "Fax": ("fax", "Fax")
        }

        for contact_type, (type_, _note) in contact_types.items():
            if web[contact_type] and web[contact_type] != 'N/A':
                p.add_contact_detail(type=type_,
                                     value=web[contact_type],
                                     note=_note)

        if web['E-mail'] and web['E-mail']['label'] and web['E-mail']['label'] != 'N/A':
            p.add_contact_detail(type='email',
                                 value=web['E-mail']['label'],
                                 note='E-mail')

        if web["Website"]:
            p.add_link(web["Website"]['url'])

        source_urls = self.person_sources_from_office(term)
        person_api_url, person_web_url = source_urls
        p.add_source(person_api_url, note='api')
        p.add_source(person_web_url, note='web')
def scrape(self):
    body_types = self.body_types()

    city_council, = [
        body for body in self.bodies()
        if body['BodyName'] == 'City Council'
    ]

    terms = collections.defaultdict(list)
    for office in self.body_offices(city_council):
        if 'VACAN' not in office['OfficeRecordFullName']:
            terms[office['OfficeRecordFullName'].strip()].append(office)

    web_scraper = LegistarPersonScraper(None, None)
    web_scraper.MEMBERLIST = 'https://chicago.legistar.com/DepartmentDetail.aspx?ID=12357&GUID=4B24D5A9-FED0-4015-9154-6BFFFB2A8CB4&R=8bcbe788-98cd-4040-9086-b34fa8e49881'
    web_scraper.ALL_MEMBERS = '3:3'

    web_info = {}
    for member, _ in web_scraper.councilMembers(
            {'ctl00$ContentPlaceHolder$lstName': 'City Council'}):
        web_info[member['Person Name']['label']] = member

    web_info['Balcer, James'] = collections.defaultdict(lambda: None)
    web_info['Fioretti, Bob'] = collections.defaultdict(lambda: None)

    web_info['Balcer, James']['Ward/Office'] = 11
    web_info['Fioretti, Bob']['Ward/Office'] = 2

    members = {}
    for member, offices in terms.items():
        web = web_info[member]
        p = Person(member)

        for term in offices:
            role = term['OfficeRecordTitle']
            p.add_term('Alderman',
                       'legislature',
                       district="Ward {}".format(int(web['Ward/Office'])),
                       start_date=self.toDate(term['OfficeRecordStartDate']),
                       end_date=self.toDate(term['OfficeRecordEndDate']))

        if web.get('Photo'):
            p.image = web['Photo']

        contact_types = {
            "City Hall Address": ("address", "City Hall Address"),
            "City Hall Phone": ("voice", "City Hall Phone"),
            "Ward Office Phone": ("voice", "Ward Office Phone"),
            "Ward Office Address": ("address", "Ward Office Address"),
            "Fax": ("fax", "Fax")
        }

        for contact_type, (type_, _note) in contact_types.items():
            if web[contact_type] and web[contact_type] != 'N/A':
                p.add_contact_detail(type=type_,
                                     value=web[contact_type],
                                     note=_note)

        if web["E-mail"] and web["E-mail"]["label"] and web["E-mail"]["label"] != 'N/A':
            p.add_contact_detail(type="email",
                                 value=web['E-mail']['label'],
                                 note='E-mail')

        if web['Website']:
            p.add_link(web['Website']['url'])

        source_urls = self.person_sources_from_office(term)
        person_api_url, person_web_url = source_urls
        p.add_source(person_api_url, note='api')
        p.add_source(person_web_url, note='web')

        members[member] = p

    for body in self.bodies():
        if body['BodyTypeId'] == body_types['Committee']:
            o = Organization(body['BodyName'],
                             classification='committee',
                             parent_id={'name': 'Chicago City Council'})

            o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body),
                         note='api')
            o.add_source(
                self.WEB_URL +
                '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'.format(**body),
                note='web')

            for office in self.body_offices(body):
                # messed up record for joanna thompson
                if office['OfficeRecordId'] == 1055:
                    continue

                role = office['OfficeRecordTitle']
                if role not in ("Vice Chair", "Chairman"):
                    role = 'Member'

                person = office['OfficeRecordFullName'].strip()

                if person in members:
                    p = members[person]
                else:
                    p = Person(person)

                    source_urls = self.person_sources_from_office(office)
                    person_api_url, person_web_url = source_urls
                    p.add_source(person_api_url, note='api')
                    p.add_source(person_web_url, note='web')

                    members[person] = p

                p.add_membership(body['BodyName'],
                                 role=role,
                                 start_date=self.toDate(office['OfficeRecordStartDate']),
                                 end_date=self.toDate(office['OfficeRecordEndDate']))

            yield o

    for body in self.bodies():
        if body['BodyTypeId'] == body_types['Joint Committee']:
            o = Organization(body['BodyName'],
                             classification='committee',
                             parent_id={'name': 'Chicago City Council'})

            o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body),
                         note='api')
            o.add_source(
                self.WEB_URL +
                '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'.format(**body),
                note='web')

            yield o

    for p in members.values():
        yield p
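# person_sources_from_office() is another helper not defined in these
# excerpts. In most of the scrapers here it is expected to return an
# (api_url, web_url) pair for the person attached to an office record; the
# Pittsburgh scraper further down instead unpacks (api_url, api_response).
# A rough sketch of the first contract, with hypothetical URL templates:
def person_sources_from_office(self, office):
    person_api_url = self.BASE_URL + '/persons/{OfficeRecordPersonId}'.format(**office)
    person_web_url = self.WEB_URL + '/PersonDetail.aspx?ID={OfficeRecordPersonId}'.format(**office)
    return person_api_url, person_web_url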
def scrape(self):
    web_scraper = LegistarPersonScraper(requests_per_minute=self.requests_per_minute)
    web_scraper.MEMBERLIST = 'http://legistar.council.nyc.gov/DepartmentDetail.aspx?ID=6897&GUID=CDC6E691-8A8C-4F25-97CB-86F31EDAB081&Mode=MainBody'

    if self.cache_storage:
        web_scraper.cache_storage = self.cache_storage

    if self.requests_per_minute == 0:
        web_scraper.cache_write_only = False

    web_info = {}
    for member, _ in web_scraper.councilMembers():
        name = member['Person Name']['label'].strip()
        web_info[name] = member

    city_council, = [body for body in self.bodies()
                     if body['BodyName'] == 'City Council']

    terms = collections.defaultdict(list)

    public_advocates = {
        # Match casing to Bill De Blasio as council member
        'The Public Advocate (Mr. de Blasio)': 'Bill De Blasio',
        'The Public Advocate (Ms. James)': 'Letitia James',
    }

    for office in self.body_offices(city_council):
        name = office['OfficeRecordFullName']
        name = public_advocates.get(name, name).strip()

        terms[name].append(office)

        # Add past members (and public advocates)
        if name not in web_info:
            web_info[name] = collections.defaultdict(lambda: None)

    # Check that we have everyone we expect, formatted consistently, in
    # both information arrays. For instance, this will fail if we forget to
    # strip trailing spaces from names on one side or the other (which has
    # the effect of omitting information, such as post, from the scrape).
    assert set(web_info.keys()) == set(terms.keys())

    members = {}

    for member, offices in terms.items():
        p = Person(member)
        web = web_info[member]

        for term in offices:
            role = term['OfficeRecordTitle']
            if role == 'Public Advocate':
                role = 'Non-Voting Council Member'
            else:
                role = 'Council Member'

            district = web.get('District', '').replace(' 0', ' ')

            p.add_term(role,
                       'legislature',
                       district=district,
                       start_date=self.toDate(term['OfficeRecordStartDate']),
                       end_date=self.toDate(term['OfficeRecordEndDate']))

            party = web.get('Political Party')
            if party == 'Democrat':
                party = 'Democratic'
            if party:
                p.add_party(party)

            if web.get('Photo'):
                p.image = web['Photo']

            contact_types = {
                "City Hall Office": ("address", "City Hall Office"),
                "City Hall Phone": ("voice", "City Hall Phone"),
                "Ward Office Phone": ("voice", "Ward Office Phone"),
                "Ward Office Address": ("address", "Ward Office Address"),
                "Fax": ("fax", "Fax")
            }

            for contact_type, (type_, _note) in contact_types.items():
                if web.get(contact_type) and web[contact_type] != 'N/A':
                    p.add_contact_detail(type=type_,
                                         value=web[contact_type],
                                         note=_note)

            if web.get('E-mail'):
                p.add_contact_detail(type="email",
                                     value=web['E-mail']['url'],
                                     note='E-mail')

            if web.get('Web site'):
                p.add_link(web['Web site']['url'], note='web site')

            if web.get('Notes'):
                p.extras = {'Notes': web['Notes']}

            if not p.sources:  # Only add sources once
                source_urls = self.person_sources_from_office(term)
                person_api_url, person_web_url = source_urls
                p.add_source(person_api_url, note='api')
                p.add_source(person_web_url, note='web')

        members[member] = p

    committee_types = ['Committee',
                       'Inactive Committee',
                       'Select Committee',
                       'Subcommittee',
                       'Task Force',
                       'Land Use',  # Committee on Land Use
                       ]

    body_types = {k: v for k, v in self.body_types().items()
                  if k in committee_types}

    for body in self.bodies():
        if body['BodyTypeName'] in body_types \
                or body['BodyName'] in ('Legislative Documents Unit',
                                        'Legal and Government Affairs Division'):

            # Skip typo in API data
            if body['BodyName'] == 'Committee on Mental Health, Developmental Disability, Alcoholism, Substance Abuse amd Disability Services':
                continue

            parent_org = PARENT_ORGS.get(body['BodyName'], 'New York City Council')
            body_name = body['BodyName']

            o = Organization(body_name,
                             classification='committee',
                             parent_id={'name': parent_org})

            o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body),
                         note='api')
            o.add_source(self.WEB_URL + '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'.format(**body),
                         note='web')

            for office in self.body_offices(body):
                # Possible roles: 'Council Member', 'MEMBER', 'Ex-Officio',
                # 'Committee Member', None, 'CHAIRPERSON'
                role = office['OfficeRecordTitle']
                if role and role.lower() == 'chairperson':
                    role = 'Chairperson'
                else:
                    role = 'Member'

                person = office['OfficeRecordFullName']
                person = public_advocates.get(person, person).strip()

                if person in members:
                    p = members[person]
                else:
                    p = Person(person)

                    source_urls = self.person_sources_from_office(office)
                    person_api_url, person_web_url = source_urls
                    p.add_source(person_api_url, note='api')
                    p.add_source(person_web_url, note='web')

                    members[person] = p

                p.add_membership(o,
                                 role=role,
                                 start_date=self.toDate(office['OfficeRecordStartDate']),
                                 end_date=self.toDate(office['OfficeRecordEndDate']))

            yield o

    for p in members.values():
        yield p
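# PARENT_ORGS above is assumed to map committee names to their parent body,
# with the full council as the fallback. A hypothetical illustration of its
# shape only; the actual mapping lives outside this excerpt:
PARENT_ORGS = {
    'Subcommittee on Landmarks, Public Siting and Maritime Uses': 'Committee on Land Use',
}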
def scrape(self):
    body_types = self.body_types()

    city_council, = [body for body in self.bodies()
                     if body["BodyName"] == "City Council"]

    terms = collections.defaultdict(list)
    for office in self.body_offices(city_council):
        if "VACAN" not in office["OfficeRecordFullName"]:
            terms[office["OfficeRecordFullName"].strip()].append(office)

    web_scraper = LegistarPersonScraper(requests_per_minute=self.requests_per_minute)
    web_scraper.MEMBERLIST = "https://pittsburgh.legistar.com/People.aspx"
    web_scraper.COMMITTEELIST = "https://pittsburgh.legistar.com/Departments.aspx"

    if self.cache_storage:
        web_scraper.cache_storage = self.cache_storage

    if self.requests_per_minute == 0:
        web_scraper.cache_write_only = False

    web_info = {}
    for member in web_scraper.councilMembers():
        web_info[member["Person Name"]] = member

    members = {}
    for member, offices in terms.items():
        person = Person(member)

        for term in offices:
            role = term["OfficeRecordTitle"]
            person.add_term("Councilmember",
                            "legislature",
                            start_date=self.toDate(term["OfficeRecordStartDate"]),
                            end_date=self.toDate(term["OfficeRecordEndDate"]))

        if member in web_info:
            web = web_info[member]
            if web["E-mail"] and web["E-mail"]["label"] and web["E-mail"]["label"] != "N/A":
                person.add_contact_detail(type="email",
                                          value=web["E-mail"]["label"],
                                          note="E-mail")

        person_source_data = self.person_sources_from_office(term)
        person_api_url, person_api_response = person_source_data
        person.add_source(person_api_url, note="api")

        if person_api_response["PersonAddress1"]:
            address = (person_api_response["PersonAddress1"] + ", "
                       + person_api_response["PersonCity1"] + ", "
                       + person_api_response["PersonState1"] + " "
                       + person_api_response["PersonZip1"])
            person.add_contact_detail(type="address",
                                      value=address,
                                      note="Office address")

        if person_api_response["PersonPhone"]:
            person.add_contact_detail(type="voice",
                                      value=person_api_response["PersonPhone"],
                                      note="Office phone")

        if person_api_response["PersonWWW"]:
            person.add_contact_detail(type="url",
                                      value=person_api_response["PersonWWW"],
                                      note="District website")

        members[member] = person

    for body in self.bodies():
        if body["BodyTypeId"] == body_types["Committee"]:
            body_name_clean = body["BodyName"].strip()

            organization = Organization(body_name_clean,
                                        classification="committee",
                                        parent_id={"name": "Pittsburgh City Council"})

            organization.add_source(self.BASE_URL + "/bodies/{BodyId}".format(**body),
                                    note="api")

            for office in self.body_offices(body):
                role = office["OfficeRecordMemberType"]
                if role not in ("Vice Chair", "Chair") or role == "Councilmember":
                    role = "Member"

                person = office["OfficeRecordFullName"].strip()

                if person in members:
                    person = members[person]
                else:
                    person = Person(person)

                person.add_membership(body_name_clean,
                                      role=role,
                                      start_date=self.toDate(office["OfficeRecordStartDate"]),
                                      end_date=self.toDate(office["OfficeRecordEndDate"]))

            yield organization

    for person in members.values():
        yield person
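# Note: these scrape() methods are generators that yield Person and
# Organization objects. They appear to be pupa-style scrapers, in which case
# they would normally be driven by the framework (e.g. `pupa update`) rather
# than called directly; the enclosing scraper classes, their imports, and
# module-level constants are omitted from these excerpts.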
def scrape(self):
    body_types = self.body_types()

    city_council, = [body for body in self.bodies()
                     if body['BodyName'] == 'City Council']

    terms = collections.defaultdict(list)
    for office in self.body_offices(city_council):
        if 'vacan' not in office['OfficeRecordFullName'].lower():
            terms[office['OfficeRecordFullName'].strip()].append(office)

    web_scraper = LegistarPersonScraper(requests_per_minute=self.requests_per_minute)
    web_scraper.MEMBERLIST = 'https://chicago.legistar.com/DepartmentDetail.aspx?ID=12357&GUID=4B24D5A9-FED0-4015-9154-6BFFFB2A8CB4&R=8bcbe788-98cd-4040-9086-b34fa8e49881'
    web_scraper.ALL_MEMBERS = '3:3'

    if self.cache_storage:
        web_scraper.cache_storage = self.cache_storage

    if self.requests_per_minute == 0:
        web_scraper.cache_write_only = False

    web_info = {}
    for member, _ in web_scraper.councilMembers({'ctl00$ContentPlaceHolder$lstName': 'City Council'}):
        web_info[member['Person Name']['label']] = member

    web_info['Balcer, James'] = collections.defaultdict(lambda: None)
    web_info['Fioretti, Bob'] = collections.defaultdict(lambda: None)

    web_info['Balcer, James']['Ward/Office'] = 11
    web_info['Fioretti, Bob']['Ward/Office'] = 2

    members = {}
    for member, offices in terms.items():
        web = web_info[member]
        p = Person(member)

        for term in offices:
            role = term['OfficeRecordTitle']
            p.add_term('Alderman',
                       'legislature',
                       district="Ward {}".format(int(web['Ward/Office'])),
                       start_date=self.toDate(term['OfficeRecordStartDate']),
                       end_date=self.toDate(term['OfficeRecordEndDate']))

        if web.get('Photo'):
            p.image = web['Photo']

        contact_types = {
            "City Hall Address": ("address", "City Hall Address"),
            "City Hall Phone": ("voice", "City Hall Phone"),
            "Ward Office Phone": ("voice", "Ward Office Phone"),
            "Ward Office Address": ("address", "Ward Office Address"),
            "Fax": ("fax", "Fax")
        }

        for contact_type, (type_, _note) in contact_types.items():
            if web[contact_type] and web[contact_type] != 'N/A':
                p.add_contact_detail(type=type_,
                                     value=web[contact_type],
                                     note=_note)

        if web["E-mail"] and web["E-mail"]["label"] and web["E-mail"]["label"] != 'N/A':
            p.add_contact_detail(type="email",
                                 value=web['E-mail']['label'],
                                 note='E-mail')

        if web['Website']:
            p.add_link(web['Website']['url'])

        source_urls = self.person_sources_from_office(term)
        person_api_url, person_web_url = source_urls
        p.add_source(person_api_url, note='api')
        p.add_source(person_web_url, note='web')

        members[member] = p

    for body in self.bodies():
        if body['BodyTypeId'] == body_types['Committee']:
            o = Organization(body['BodyName'],
                             classification='committee',
                             parent_id={'name': 'Chicago City Council'})

            o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body),
                         note='api')
            o.add_source(self.WEB_URL + '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'.format(**body),
                         note='web')

            for office in self.body_offices(body):
                # messed up record for joanna thompson
                if office['OfficeRecordId'] == 1055:
                    continue

                role = office['OfficeRecordTitle']
                if role not in ("Vice Chair", "Chairman"):
                    role = 'Member'

                person = office['OfficeRecordFullName'].strip()

                if person in members:
                    p = members[person]
                else:
                    p = Person(person)

                    source_urls = self.person_sources_from_office(office)
                    person_api_url, person_web_url = source_urls
                    p.add_source(person_api_url, note='api')
                    p.add_source(person_web_url, note='web')

                    members[person] = p

                try:
                    end_date = self.toDate(office['OfficeRecordEndDate'])
                except TypeError:
                    end_date = ''

                p.add_membership(body['BodyName'],
                                 role=role,
                                 start_date=self.toDate(office['OfficeRecordStartDate']),
                                 end_date=end_date)

            yield o

    for body in self.bodies():
        if body['BodyTypeId'] == body_types['Joint Committee']:
            o = Organization(body['BodyName'],
                             classification='committee',
                             parent_id={'name': 'Chicago City Council'})

            o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body),
                         note='api')
            o.add_source(self.WEB_URL + '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'.format(**body),
                         note='web')

            yield o

    for p in members.values():
        yield p
def scrape(self):
    '''
    Scrape the web to create a dict with all active organizations. Then, we
    can access the correct URL for the organization detail page.
    '''
    web_scraper = LegistarPersonScraper(requests_per_minute=self.requests_per_minute)
    web_scraper.MEMBERLIST = 'https://metro.legistar.com/People.aspx'

    web_info = {}

    for _, organizations in web_scraper.councilMembers():
        for organization, _, _ in organizations:
            organization_name = organization['Department Name']['label'].strip()
            organization_info = organization['Department Name']

            web_info[organization_name] = organization_info

    body_types = self.body_types()

    board_of_directors, = [body for body in self.bodies()
                           if body['BodyName'] == 'Board of Directors - Regular Board Meeting']
    board_of_directors["BodyName"] = "Board of Directors"

    terms = collections.defaultdict(list)
    for office in self.body_offices(board_of_directors):
        terms[office['OfficeRecordFullName']].append(office)

    members = {}
    for member, offices in terms.items():
        p = Person(member)

        for term in offices:
            role = term['OfficeRecordTitle']

            if role not in {'Board Member', 'non-voting member'}:
                p.add_term(role,
                           'legislature',
                           start_date=self.toDate(term['OfficeRecordStartDate']),
                           end_date=self.toDate(term['OfficeRecordEndDate']),
                           appointment=True)

            if role != 'Chief Executive Officer':
                if role == 'non-voting member':
                    member_type = 'Nonvoting Board Member'
                    post = NONVOTING_POSTS.get(member)
                else:
                    member_type = 'Board Member'
                    post = VOTING_POSTS.get(member)

                start_date = self.toDate(term['OfficeRecordStartDate'])
                end_date = self.toDate(term['OfficeRecordEndDate'])

                board_membership = p.add_term(member_type,
                                              'legislature',
                                              district=post,
                                              start_date=start_date,
                                              end_date=end_date)

                acting_member_end_date = ACTING_MEMBERS_WITH_END_DATE.get(p.name)

                if acting_member_end_date and acting_member_end_date <= end_date:
                    board_membership.extras = {'acting': 'true'}

        source_urls = self.person_sources_from_office(term)
        person_api_url, person_web_url = source_urls
        p.add_source(person_api_url, note='api')
        p.add_source(person_web_url, note='web')

        members[member] = p

    for body in self.bodies():
        if body['BodyTypeId'] == body_types['Committee']:
            organization_name = body['BodyName'].strip()

            o = Organization(organization_name,
                             classification='committee',
                             parent_id={'name': 'Board of Directors'})

            organization_info = web_info.get(organization_name, {})
            # The fallback is already an absolute URL, so it should not be
            # prefixed with self.WEB_URL.
            organization_url = organization_info.get(
                'url', 'https://metro.legistar.com/Departments.aspx')

            o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body),
                         note='api')
            o.add_source(organization_url, note='web')

            for office in self.body_offices(body):
                role = office['OfficeRecordTitle']

                if role not in ("Chair", "Vice Chair", "Chief Executive Officer"):
                    if role == 'non-voting member':
                        role = 'Nonvoting Member'
                    else:
                        role = 'Member'

                person = office['OfficeRecordFullName']

                if person in members:
                    p = members[person]
                else:
                    p = Person(person)

                    source_urls = self.person_sources_from_office(office)
                    person_api_url, person_web_url = source_urls
                    p.add_source(person_api_url, note='api')
                    p.add_source(person_web_url, note='web')

                    members[person] = p

                start_date = self.toDate(office['OfficeRecordStartDate'])
                end_date = self.toDate(office['OfficeRecordEndDate'])

                membership = p.add_membership(organization_name,
                                              role=role,
                                              start_date=start_date,
                                              end_date=end_date)

                acting_member_end_date = ACTING_MEMBERS_WITH_END_DATE.get(p.name)

                if acting_member_end_date and acting_member_end_date <= end_date:
                    membership.extras = {'acting': 'true'}

            yield o

    for p in members.values():
        yield p