def scrape(self):
    """Yield one Person per commissioner, then every committee seen.

    Iterates ``self.councilMembers()``, which is consumed here as
    ``(councilman, committees)`` pairs where ``councilman`` is a mapping
    of web-page fields and ``committees`` is an iterable of 3-tuples
    whose first element describes a committee assignment.

    People are yielded as they are built; committee Organizations are
    collected in a dict keyed by name and yielded after all people.
    """
    # Commissioners whose district is hard-coded rather than read from
    # their profile URL.
    # NOTE(review): presumably their URLs do not contain a district
    # number -- confirm.
    district_overrides = {
        'Robert Steele': 2,
        'Jerry Butler': 3,
        'Sean Morrison': 17,
    }

    committee_d = {}

    for councilman, committees in self.councilMembers():
        p = Person(' '.join((councilman['First name'],
                             councilman['Last name'])))

        # This person is deliberately not emitted by the scraper.
        if p.name == 'Toni Preckwinkle':
            continue

        district = district_overrides.get(p.name)
        if district is None:
            # First run of digits in the profile URL is the district.
            # Raw string: '\d' in a normal literal is an invalid escape.
            district = re.findall(r'\d+',
                                  councilman['Person Name']['url'])[0]

        start_date = self.toTime(councilman['Start Date']).date()
        end_date = self.toTime(councilman['End Date']).date()

        # This specific end date is emitted as an empty string.
        # NOTE(review): presumably it marks a still-open term -- confirm.
        if end_date == datetime.date(2018, 12, 2):
            end_date = ''
        else:
            end_date = end_date.isoformat()

        p.add_term('Commissioner', 'legislature',
                   district='District {}'.format(district),
                   start_date=start_date.isoformat(),
                   end_date=end_date)

        if councilman["E-mail"]:
            p.add_contact_detail(type="email",
                                 value=councilman['E-mail']['url'],
                                 note='E-mail')

        if councilman['Web site']:
            p.add_link(councilman['Web site']['url'], note='web site')

        p.add_source(councilman['Person Name']['url'])

        for committee, _, _ in committees:
            committee_name = committee['Department Name']['label']
            # Only bodies whose label mentions "committee" are recorded.
            if 'committee' in committee_name.lower():
                o = committee_d.get(committee_name, None)
                if o is None:
                    o = Organization(
                        committee_name,
                        classification='committee',
                        parent_id={'name': 'Cook County Board of Commissioners'})
                    o.add_source(committee['Department Name']['url'])
                    committee_d[committee_name] = o

                membership = o.add_member(p, role=committee["Title"])
                membership.start_date = self.mdY2Ymd(committee["Start Date"])

        yield p

    for o in committee_d.values():
        yield o
def test_person_add_term():
    """A term added via ``add_term`` validates and keeps its org and dates."""
    eternal = Person('Eternal')
    eternal.add_term('eternal', 'council',
                     start_date='0001', end_date='9999')

    # add_term records the new membership as the first related object.
    term = eternal._related[0]
    term.validate()

    expected_org = {
        'classification': 'council',
    }
    assert get_pseudo_id(term.organization_id) == expected_org
    assert term.start_date == '0001'
    assert term.end_date == '9999'
def scrape(self):
    """Yield a Person for each historical legislator in the YAML file.

    Reads ``congress-legislators/legislators-historical.yaml`` from the
    directory containing this module. Legislators whose terms all ended
    before 1970 are skipped. The comparison is done on the ``end`` value
    against the string ``'1970'``.
    """
    yaml_path = (Path(__file__).parent
                 / 'congress-legislators/legislators-historical.yaml')

    with yaml_path.open() as f:
        legislators = yaml.load(f, Loader=yaml.CLoader)

    for legislator in legislators:
        terms = legislator['terms']
        if all(term['end'] < '1970' for term in terms):
            continue

        full_name = ' '.join((legislator['name']['first'],
                              legislator['name']['last']))
        bio = legislator['bio']
        person = Person(name=full_name,
                        birth_date=bio.get('birthday', ''),
                        gender=bio['gender'])

        # Every distinct party across the terms is attached once.
        parties = set()

        for term in terms:
            state = term['state']
            parties.add(term['party'])

            is_representative = term['type'] == 'rep'
            if is_representative:
                role, chamber = 'Representative', 'lower'
                district_name = self._district_name(state, term['district'])
            else:
                role, chamber = "Senator", 'upper'
                district_name = "{state}, Class {klass}".format(
                    state=state, klass=term['class'])

            person.add_term(role, chamber,
                            district=district_name,
                            start_date=term['start'],
                            end_date=term['end'])

        for party in parties:
            person.add_party(party)

        for scheme, identifier in legislator['id'].items():
            person.add_identifier(str(identifier), scheme=scheme)

        person.add_source(
            'https://github.com/unitedstates/congress-legislators/blob/master/legislators-historical.yaml'
        )

        yield person
def get_organizations(self):
    """Yield the council, the city executive and one hard-coded mayor.

    Yield order: the "Mountain View City Council" legislature, the
    "City of Mountain View" executive (with Mayor, City Manager and City
    Clerk posts), then a Person holding a Mayor term.
    """
    # The legislature itself; it is yielded without any posts.
    council = Organization(name="Mountain View City Council",
                           classification="legislature")
    yield council

    # Executive organization; each post uses the same label and role and
    # is attached to the city-wide division.
    city = Organization('City of Mountain View', classification='executive')
    for office in ('Mayor', 'City Manager', 'City Clerk'):
        city.add_post(
            office, office,
            division_id='ocd-division/country:us/state:ca/place:mountainview')
    yield city

    # NOTE(review): these term dates are hard-coded -- confirm they are
    # correct for this person.
    mayor = Person(name="Abe-Koga, Margaret")
    mayor.add_term('Mayor', 'executive',
                   start_date=datetime.date(1989, 4, 24),
                   end_date=datetime.date(2011, 5, 16),
                   appointment=True)
    mayor.add_source('https://mountainview.legistar.com/People.aspx')
    yield mayor
def scrape(self):
    """Yield one Person per distinct council member found on ward pages.

    Fetches the city-council landing page, follows every distinct link
    containing ``'ward'``, and reads the member name and the page title
    from each ward page. A member is emitted only the first time their
    name is seen.

    Raises:
        IndexError: if a ward page lacks the expected content block,
            name link, page title, or a ``-`` separator in the title.
    """
    council_url = 'https://www.stpaul.gov/departments/city-council'
    council = requests.get(council_url)
    base = html.fromstring(council.text)
    base.make_links_absolute(council_url)

    hrefs = base.xpath('.//*[@class="field-item even"]/p/a/@href')
    # De-duplicate, keep only ward pages; sorted so the visiting order is
    # deterministic from run to run.
    links = sorted({href for href in hrefs if 'ward' in href})

    seen_names = set()

    for link in links:
        root = requests.get(link)
        page = html.fromstring(root.text)

        block = page.xpath(
            './/*[@class="well well--blue well--big-padding block-content"]'
        )[0]

        name_parts = block.xpath('.//p/a/text()')[0].split(' ')
        title = page.xpath('.//*[@id="page-title"]/text()')[0]

        # A three-part name has its middle part dropped.
        if len(name_parts) == 3:
            name_parts.pop(1)
        name = ' '.join(name_parts)

        if name in seen_names:
            continue
        seen_names.add(name)

        # The page title is split on '-': the left part is the ward, and
        # the first space-delimited token of the right part is the role.
        ward = title.split('-')[0].strip()
        role = title.split('-')[1].split(' ')[0].strip()

        member = Person(name=name, role=role)
        member.add_source(link)
        member.add_term(role, 'legislature',
                        org_name='Saint Paul City Council',
                        district=ward)
        yield member
def get_organizations(self):
    """Yield the Common Council, the city executive and one mayor.

    Yield order: the "Common Council" legislature with an "Alderman"
    post for each of districts 1 through 15, the "City of Milwaukee"
    executive with a Mayor post, then a Person holding a Mayor term.
    """
    council = Organization(name="Common Council",
                           classification="legislature")

    # One post per council district, labelled "District N".
    for district in range(1, 16):
        division = (
            'ocd-division/country:us/state:wi/place:milwaukee/council_district:{}'
            .format(district))
        council.add_post("District {}".format(district), "Alderman",
                         division_id=division)

    yield council

    city = Organization("City of Milwaukee", classification='executive')
    city.add_post(
        'Mayor', 'Mayor',
        division_id='ocd-division/country:us/state:wi/place:milwaukee')
    yield city

    # Mayor term with a start date and no end date.
    mayor = Person(name="Barrett, Tom")
    mayor.add_term('Mayor', 'executive',
                   start_date=datetime.date(2004, 4, 15),
                   appointment=True)
    mayor.add_source('https://milwaukee.legistar.com/People.aspx')
    yield mayor
def scrape(self):
    """Yield board members, committees, then non-board committee members.

    Yield order:
      1. a Person for each name with an office record on the body named
         'Board of Directors', carrying one term per office record;
      2. an Organization for each body whose type is 'Committee';
      3. a Person for each committee office holder who is not on the
         board ("adjunct" members), carrying their committee memberships.

    Raises:
        ValueError: if there is not exactly one 'Board of Directors' body.
    """
    body_types = self.body_types()

    # Unpacking into a one-element target asserts exactly one match.
    board_of_directors, = [
        body for body in self.bodies()
        if body['BodyName'] == 'Board of Directors'
    ]

    # Group the board's office records by the holder's full name.
    members = {}
    for office in self.body_offices(board_of_directors):
        members.setdefault(office['OfficeRecordFullName'], []).append(office)

    for member, offices in members.items():
        p = Person(member)

        for term in offices:
            role = term['OfficeRecordTitle']

            if role != 'non-voting member':
                role = 'Board Member'
                post = VOTING_POSTS.get(member)
            else:
                role = 'Nonvoting Board Member'
                post = NONVOTING_POSTS.get(member)

            # Dates must come from the record being processed (`term`),
            # not from the `office` variable left over by the grouping
            # loop above, which always holds the last record seen.
            p.add_term(role, 'legislature',
                       district=post,
                       start_date=self.toDate(term['OfficeRecordStartDate']),
                       end_date=self.toDate(term['OfficeRecordEndDate']))

        legistar_api = self.BASE_URL + '/OfficeRecords/'
        p.add_source(legistar_api, note='api')

        yield p

    # People who hold committee offices but are not board members.
    adjunct_members = {}

    for body in self.bodies():
        if body['BodyTypeId'] != body_types['Committee']:
            continue

        o = Organization(body['BodyName'],
                         classification='committee',
                         parent_id={'name': 'Board of Directors'})
        o.add_source(self.BASE_URL + '/Bodies/')

        for office in self.body_offices(body):
            role = office['OfficeRecordTitle']
            # Any title other than the two chair titles becomes 'Member'.
            if role not in ("Chair", "Vice Chair"):
                role = 'Member'

            person = office['OfficeRecordFullName']
            start_date = self.toDate(office['OfficeRecordStartDate'])
            end_date = self.toDate(office['OfficeRecordEndDate'])

            if person in members:
                # Board members were already yielded above, so the
                # membership is attached from the organization side.
                o.add_member(person, role,
                             start_date=start_date,
                             end_date=end_date)
                continue

            if person in adjunct_members:
                p = adjunct_members[person]
            else:
                p = Person(person)
                # TODO(review): 'foo' is a placeholder source carried
                # over from the original; replace with a real URL.
                p.add_source('foo')

            p.add_membership(body['BodyName'],
                             role=role,
                             start_date=start_date,
                             end_date=end_date)
            adjunct_members[person] = p

        yield o

    for p in adjunct_members.values():
        yield p
def scrape(self):
    """Yield NYC council committees, then every person encountered.

    Combines two data sources: the Legistar web member list (read via a
    ``LegistarPersonScraper``) for per-person details, and the API's
    office records for terms and committee memberships.

    Committees are yielded as they are built; all people (council
    members and committee-only members) are yielded at the end.

    Raises:
        ValueError: if there is not exactly one 'City Council' body.
        AssertionError: if the names from the web list and the names
            from the API office records do not match exactly.
    """
    web_scraper = LegistarPersonScraper(
        requests_per_minute=self.requests_per_minute)
    web_scraper.MEMBERLIST = 'http://legistar.council.nyc.gov/DepartmentDetail.aspx?ID=6897&GUID=CDC6E691-8A8C-4F25-97CB-86F31EDAB081&Mode=MainBody'

    if self.cache_storage:
        web_scraper.cache_storage = self.cache_storage

    if self.requests_per_minute == 0:
        web_scraper.cache_write_only = False

    # Web details keyed by stripped person name.
    web_info = {}
    for member, _ in web_scraper.councilMembers():
        name = member['Person Name']['label'].strip()
        web_info[name] = member

    city_council, = [
        body for body in self.bodies()
        if body['BodyName'] == 'City Council'
    ]

    terms = collections.defaultdict(list)

    public_advocates = {
        # Match casing to Bill De Blasio as council member
        'The Public Advocate (Mr. de Blasio)': 'Bill De Blasio',
        'The Public Advocate (Ms. James)': 'Letitia James',
    }

    for office in self.body_offices(city_council):
        name = office['OfficeRecordFullName']
        name = public_advocates.get(name, name).strip()

        terms[name].append(office)

        # Add past members (and public advocates): they have no web
        # entry, so give them one that answers None for every key.
        if name not in web_info:
            web_info[name] = collections.defaultdict(lambda: None)

    # Check that we have everyone we expect, formatted consistently, in
    # both information arrays. For instance, this will fail if we forget
    # to strip trailing spaces from names on one side or the other (which
    # has the effect of omitting information, such as post, from the
    # scrape).
    assert set(web_info.keys()) == set(terms.keys())

    # Web-page field -> (contact detail type, note). Loop-invariant, so
    # it is built once here.
    contact_types = {
        "City Hall Office": ("address", "City Hall Office"),
        "City Hall Phone": ("voice", "City Hall Phone"),
        "Ward Office Phone": ("voice", "Ward Office Phone"),
        "Ward Office Address": ("address", "Ward Office Address"),
        "Fax": ("fax", "Fax")
    }

    members = {}

    for member, offices in terms.items():
        p = Person(member)
        web = web_info[member]

        for term in offices:
            role = term['OfficeRecordTitle']

            if role == 'Public Advocate':
                role = 'Non-Voting Council Member'
            else:
                role = 'Council Member'

            # " 0" -> " " drops a leading zero after a space.
            district = web.get('District', '').replace(' 0', ' ')

            p.add_term(role, 'legislature',
                       district=district,
                       start_date=self.toDate(term['OfficeRecordStartDate']),
                       end_date=self.toDate(term['OfficeRecordEndDate']))

            party = web.get('Political Party')
            if party == 'Democrat':
                party = 'Democratic'
            if party:
                p.add_party(party)

            if web.get('Photo'):
                p.image = web['Photo']

            for contact_type, (type_, _note) in contact_types.items():
                # Subscript, not call: `web` is a mapping. Calling it
                # raised TypeError whenever a contact value was present.
                if web.get(contact_type) and web[contact_type] != 'N/A':
                    p.add_contact_detail(type=type_,
                                         value=web[contact_type],
                                         note=_note)

            if web.get('E-mail'):
                p.add_contact_detail(type="email",
                                     value=web['E-mail']['url'],
                                     note='E-mail')

            if web.get('Web site'):
                p.add_link(web['Web site']['url'], note='web site')

            if web.get('Notes'):
                p.extras = {'Notes': web['Notes']}

            if not p.sources:  # Only add sources once
                source_urls = self.person_sources_from_office(term)
                person_api_url, person_web_url = source_urls
                p.add_source(person_api_url, note='api')
                p.add_source(person_web_url, note='web')

        members[member] = p

    committee_types = [
        'Committee', 'Inactive Committee', 'Select Committee',
        'Subcommittee', 'Task Force', 'Land Use'  # Committee on Land Use
    ]

    body_types = {
        k: v for k, v in self.body_types().items()
        if k in committee_types
    }

    for body in self.bodies():
        is_committee = (
            body['BodyTypeName'] in body_types
            or body['BodyName'] in ('Legislative Documents Unit',
                                    'Legal and Government Affairs Division'))
        if not is_committee:
            continue

        # Skip typo in API data
        if body['BodyName'] == 'Committee on Mental Health, Developmental Disability, Alcoholism, Substance Abuse amd Disability Services':
            continue

        parent_org = PARENT_ORGS.get(body['BodyName'],
                                     'New York City Council')
        body_name = body['BodyName']

        o = Organization(body_name,
                         classification='committee',
                         parent_id={'name': parent_org})

        o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body),
                     note='api')
        o.add_source(
            self.WEB_URL
            + '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'.format(**body),
            note='web')

        for office in self.body_offices(body):
            # Possible roles: 'Council Member', 'MEMBER', 'Ex-Officio',
            # 'Committee Member', None, 'CHAIRPERSON'
            role = office['OfficeRecordTitle']

            if role and role.lower() == 'chairperson':
                role = 'Chairperson'
            else:
                role = 'Member'

            person = office['OfficeRecordFullName']
            person = public_advocates.get(person, person).strip()

            if person in members:
                p = members[person]
            else:
                # Committee-only person: create and source them here.
                p = Person(person)

                source_urls = self.person_sources_from_office(office)
                person_api_url, person_web_url = source_urls
                p.add_source(person_api_url, note='api')
                p.add_source(person_web_url, note='web')

                members[person] = p

            p.add_membership(o,
                             role=role,
                             start_date=self.toDate(office['OfficeRecordStartDate']),
                             end_date=self.toDate(office['OfficeRecordEndDate']))

        yield o

    for p in members.values():
        yield p
def get_organizations(self):
    """Yield the city, its council, the mayor and scraped committees.

    Yield order: the "City of Saint Paul" executive, the "Saint Paul
    City Council" legislature (wards 1-7), a Person holding the Mayor
    term, then one committee Organization per distinct meeting name
    found on the city calendar for the dates in the module-level
    ``date_range``.

    Progress is reported with ``print``.
    """
    global date_range

    city = Organization('City of Saint Paul', classification='executive')
    for office in ('Mayor', 'City Clerk'):
        city.add_post(
            office, office,
            division_id='ocd-division/country:us/state:mn/place:st_paul')
    yield city

    council = Organization(name="Saint Paul City Council",
                           classification="legislature",
                           parent_id=city)
    for ward in range(1, 8):
        council.add_post(
            "Ward {}".format(ward), "Councilmember",
            division_id='ocd-division/country:us/state:mn/place:st_paul/ward:{}'.format(ward))
    yield council

    mayor = Person(name="Melvin Carter")
    mayor.add_term('Mayor', 'executive',
                   start_date=dtdate(2018, 1, 19),
                   appointment=True)
    mayor.add_source('http://www.google.com')
    yield mayor

    # Meetings that will become committees, plus the names already taken.
    committee_meetings = []
    seen_names = []

    for date in date_range:
        print('Checking date:', date)
        response = requests.get("https://www.stpaul.gov/calendar/" + date)
        page = html.fromstring(response.text)

        # Collect every calendar row that carries a single display date.
        meetings = []
        for row in page.xpath('.//*/div[@class="view-content"]/div'):
            dates = row.xpath(
                './/*/span[@class="date-display-single"]/text()')
            if not dates:
                continue
            entry = {}
            entry['date'] = dates[0]
            entry['info'] = row.xpath(
                './/*/span[@class="field-content"]/a/text()')[0]
            entry['link'] = row.xpath(
                './/*/span[@class="field-content"]/a/@href')[0]
            meetings.append(entry)

        # Calendar hrefs are prefixed with the site root.
        for meeting in meetings:
            meeting['link'] = "https://www.stpaul.gov" + meeting['link']

        for meeting in meetings:
            detail = html.fromstring(requests.get(meeting['link']).text)
            if not detail.xpath('.//div[@class="node-content clearfix"]'):
                continue

            info = meeting['info']
            # Council, legislative and holiday entries are not committees.
            if ('City Council' in info or 'Legislative' in info
                    or 'Holiday' in info):
                continue

            meeting['name'] = info.replace('Meeting', '').replace(
                ' - Cancelled', '').replace('Events', '').strip()

            if meeting['name'] not in seen_names:
                seen_names.append(meeting['name'])
                committee_meetings.append(meeting)

    print('Creating organizations')
    for meeting in committee_meetings:
        print(meeting)
        committee = Organization(name=meeting['name'],
                                 classification='committee',
                                 parent_id=city)
        committee.add_source(meeting['link'])
        yield committee
def scrape(self):
    '''
    Scrape the web to create a dict with all active organizations.
    Then, we can access the correct URL for the organization detail page.

    Yields committee Organizations as they are built, followed by every
    Person encountered (board members and committee-only members).

    Raises:
        ValueError: if there is not exactly one body named
            'Board of Directors - Regular Board Meeting'.
    '''
    web_scraper = LegistarPersonScraper(
        requests_per_minute=self.requests_per_minute)
    web_scraper.MEMBERLIST = 'https://metro.legistar.com/People.aspx'

    # Organization name (stripped) -> its 'Department Name' web entry.
    web_info = {}
    for _, organizations in web_scraper.councilMembers():
        for organization, _, _ in organizations:
            organization_name = organization['Department Name']['label'].strip()
            organization_info = organization['Department Name']
            web_info[organization_name] = organization_info

    body_types = self.body_types()

    board_of_directors, = [
        body for body in self.bodies()
        if body['BodyName'] == 'Board of Directors - Regular Board Meeting'
    ]
    board_of_directors["BodyName"] = "Board of Directors"

    # Board office records grouped by the holder's full name.
    terms = collections.defaultdict(list)
    for office in self.body_offices(board_of_directors):
        terms[office['OfficeRecordFullName']].append(office)

    members = {}
    for member, offices in terms.items():
        p = Person(member)

        for term in offices:
            role = term['OfficeRecordTitle']

            # Titles other than plain membership get a term of their own.
            if role not in {'Board Member', 'non-voting member'}:
                p.add_term(
                    role, 'legislature',
                    start_date=self.toDate(term['OfficeRecordStartDate']),
                    end_date=self.toDate(term['OfficeRecordEndDate']),
                    appointment=True)

            # Everyone except the CEO also gets a board-membership term.
            if role != 'Chief Executive Officer':
                if role == 'non-voting member':
                    member_type = 'Nonvoting Board Member'
                    post = NONVOTING_POSTS.get(member)
                else:
                    member_type = 'Board Member'
                    post = VOTING_POSTS.get(member)

                start_date = self.toDate(term['OfficeRecordStartDate'])
                end_date = self.toDate(term['OfficeRecordEndDate'])

                board_membership = p.add_term(member_type, 'legislature',
                                              district=post,
                                              start_date=start_date,
                                              end_date=end_date)

                acting_member_end_date = ACTING_MEMBERS_WITH_END_DATE.get(
                    p.name)
                if acting_member_end_date and acting_member_end_date <= end_date:
                    board_membership.extras = {'acting': 'true'}

        # Sources are derived from the last office record of the person.
        source_urls = self.person_sources_from_office(term)
        person_api_url, person_web_url = source_urls
        p.add_source(person_api_url, note='api')
        p.add_source(person_web_url, note='web')

        members[member] = p

    for body in self.bodies():
        if body['BodyTypeId'] != body_types['Committee']:
            continue

        organization_name = body['BodyName'].strip()

        o = Organization(organization_name,
                         classification='committee',
                         parent_id={'name': 'Board of Directors'})

        organization_info = web_info.get(organization_name, {})
        # Fall back to the departments listing when the web scrape did
        # not see this organization. The fallback is already an absolute
        # URL, so it must not be prefixed with self.WEB_URL.
        organization_url = organization_info.get(
            'url', 'https://metro.legistar.com/Departments.aspx')

        o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body),
                     note='api')
        o.add_source(organization_url, note='web')

        for office in self.body_offices(body):
            role = office['OfficeRecordTitle']

            if role not in ("Chair", "Vice Chair", "Chief Executive Officer"):
                if role == 'non-voting member':
                    role = 'Nonvoting Member'
                else:
                    role = 'Member'

            person = office['OfficeRecordFullName']

            if person in members:
                p = members[person]
            else:
                # Committee-only person: create and source them here.
                p = Person(person)

                source_urls = self.person_sources_from_office(office)
                person_api_url, person_web_url = source_urls
                p.add_source(person_api_url, note='api')
                p.add_source(person_web_url, note='web')

                members[person] = p

            start_date = self.toDate(office['OfficeRecordStartDate'])
            end_date = self.toDate(office['OfficeRecordEndDate'])

            membership = p.add_membership(organization_name,
                                          role=role,
                                          start_date=start_date,
                                          end_date=end_date)

            acting_member_end_date = ACTING_MEMBERS_WITH_END_DATE.get(p.name)
            if acting_member_end_date and acting_member_end_date <= end_date:
                membership.extras = {'acting': 'true'}

        yield o

    for p in members.values():
        yield p
def scrape(self):
    """Yield NYC council members, then committees and subcommittees.

    Entries from ``self.councilMembers()`` are grouped by the person's
    profile URL. Entries without a URL are ignored. For each person, the
    (start, end, district) spans of all entries are sorted and adjacent
    spans are merged when the next one starts the day after the previous
    one ended and the district is unchanged. One term is added per
    merged span.

    Person-level details (party, photo, e-mail, web site, notes, source)
    are read from the person's last entry. Only the committees of the
    first entry seen for a URL are used.

    Yield order: people; organizations whose name contains 'Committee';
    organizations whose name contains 'Subcommittee'; then two
    hard-coded organizations.
    """
    noncommittees = {'Committee of the Whole'}
    committee_d = {}

    # Profile URL -> ([entries...], committees of the first entry).
    people_d = {}

    for councilman, committees in self.councilMembers():
        if 'url' in councilman['Person Name']:
            councilman_url = councilman['Person Name']['url']

            if councilman_url in people_d:
                people_d[councilman_url][0].append(councilman)
            else:
                people_d[councilman_url] = [councilman], committees

    for person_entries, committees in people_d.values():
        councilman = person_entries[-1]

        p = Person(councilman['Person Name']['label'])

        if p.name == 'Letitia James':
            p.name = 'Letitia Ms. James'
            p.add_name('Letitia James')

        spans = [(self.toTime(entry['Start Date']).date(),
                  self.toTime(entry['End Date']).date(),
                  entry['District'])
                 for entry in person_entries]

        merged_spans = []
        last_end_date = None
        last_district = None
        for start_date, end_date, district in sorted(spans):
            if last_end_date is None:
                span = [start_date, end_date, district]
            elif ((start_date - last_end_date) == datetime.timedelta(1)
                  and district == last_district):
                # Contiguous and same district: extend the open span.
                span[1] = end_date
            else:
                merged_spans.append(span)
                span = [start_date, end_date, district]

            last_end_date = end_date
            last_district = district

        merged_spans.append(span)

        for start_date, end_date, district in merged_spans:
            # Use the district recorded for this span; the original
            # overwrote it with the latest entry's district, mislabelling
            # earlier terms of anyone who changed districts.
            # " 0" -> " " drops a leading zero after a space.
            district = district.replace(' 0', ' ')

            # This specific end date is emitted as an empty string.
            # NOTE(review): presumably it marks a still-open term --
            # confirm.
            if end_date == datetime.date(2017, 12, 31):
                end_date = ''
            else:
                end_date = end_date.isoformat()

            p.add_term('Council Member', 'legislature',
                       district=district,
                       start_date=start_date.isoformat(),
                       end_date=end_date)

        party = councilman['Political Party']
        if party == 'Democrat':
            party = 'Democratic'

        if party:
            p.add_party(party)

        if councilman['Photo']:
            p.image = councilman['Photo']

        if councilman["E-mail"]:
            p.add_contact_detail(type="email",
                                 value=councilman['E-mail']['url'],
                                 note='E-mail')

        if councilman['Web site']:
            p.add_link(councilman['Web site']['url'], note='web site')

        p.extras = {'Notes': councilman['Notes']}

        p.add_source(councilman['Person Name']['url'], note='web')

        for committee, _, _ in committees:
            committee_name = committee['Department Name']['label']
            if (committee_name not in noncommittees
                    and 'committee' in committee_name.lower()):
                o = committee_d.get(committee_name, None)
                if o is None:
                    parent_id = PARENT_ORGS.get(committee_name,
                                                'New York City Council')
                    o = Organization(committee_name,
                                     classification='committee',
                                     parent_id={'name': parent_id})
                    o.add_source(committee['Department Name']['url'])
                    committee_d[committee_name] = o

                membership = o.add_member(p, role=committee["Title"])
                membership.start_date = self.mdY2Ymd(committee["Start Date"])

        yield p

    # Names containing 'Committee' go first, then names containing
    # 'Subcommittee'.
    # NOTE(review): presumably so parents are emitted before their
    # children -- confirm.
    for o in committee_d.values():
        if 'Committee' in o.name:
            yield o

    for o in committee_d.values():
        if 'Subcommittee' in o.name:
            yield o

    # Two organizations that are always emitted, independent of the
    # scraped data.
    o = Organization('Committee on Mental Health, Developmental Disability, Alcoholism, Drug Abuse and Disability Services',
                     classification='committee',
                     parent_id={'name': 'New York City Council'})
    o.add_source("http://legistar.council.nyc.gov/Departments.aspx")
    yield o

    o = Organization('Subcommittee on Drug Abuse',
                     classification='committee',
                     parent_id={'name': 'Committee on Mental Health, Developmental Disability, Alcoholism, Drug Abuse and Disability Services'})
    o.add_source("http://legistar.council.nyc.gov/Departments.aspx")
    yield o
def get_organizations(self):
    """Yield the City Council, the city executive and its office holders.

    Yield order: the "Chicago City Council" legislature with an
    "Alderman" post for each of wards 1 through 50, the "City of
    Chicago" executive with Mayor and City Clerk posts, then one Person
    per hard-coded mayor or clerk, in the order listed below.
    """
    council = Organization(name="Chicago City Council",
                           classification="legislature")
    for ward in range(1, 51):
        council.add_post(
            "Ward {}".format(ward), "Alderman",
            division_id='ocd-division/country:us/state:il/place:chicago/ward:{}'.format(ward))
    yield council

    city = Organization('City of Chicago', classification='executive')
    for office in ('Mayor', 'City Clerk'):
        city.add_post(
            office, office,
            division_id='ocd-division/country:us/state:il/place:chicago')
    yield city

    # (name, office, term dates). A missing 'end_date' means the term is
    # added without one.
    office_holders = (
        ("Daley, Richard M.", 'Mayor',
         {'start_date': datetime.date(1989, 4, 24),
          'end_date': datetime.date(2011, 5, 16)}),
        ("Emanuel, Rahm", 'Mayor',
         {'start_date': datetime.date(2011, 5, 16)}),
        ('Mendoza, Susana A.', 'City Clerk',
         {'start_date': datetime.date(2011, 5, 16),
          'end_date': datetime.date(2016, 12, 4)}),
        ('Del Valle, Miguel', 'City Clerk',
         {'start_date': datetime.date(2006, 12, 1),
          'end_date': datetime.date(2011, 5, 16)}),
        ('Valencia, Anna M.', 'City Clerk',
         {'start_date': datetime.date(2017, 1, 25),
          'end_date': datetime.date(2019, 5, 20)}),
    )

    for full_name, office, dates in office_holders:
        official = Person(name=full_name)
        official.add_term(role=office,
                          org_classification='executive',
                          appointment=True,
                          **dates)
        official.add_source('https://chicago.legistar.com/People.aspx')
        yield official
def scrape(self):
    """Yield Sacramento committees, commissions, then all people.

    Yield order: an Organization for each 'Standing Committees' body
    (memberships are attached to the people, not the organization), an
    Organization for each 'Boards or Commission' body, then every
    Person encountered (council members and committee-only members).

    Raises:
        ValueError: if there is not exactly one body named
            'City Council ' (the trailing space is part of the name
            being matched).
    """
    body_types = self.body_types()

    city_council, = [body for body in self.bodies()
                     if body['BodyName'] == 'City Council ']

    # Council office records grouped by holder name. Names are stripped
    # here because the committee loop below looks people up by stripped
    # name; keying unstripped would create a duplicate Person for anyone
    # whose record carries stray whitespace.
    terms = collections.defaultdict(list)
    for office in self.body_offices(city_council):
        name = office['OfficeRecordFullName'].strip()
        # "Granicus BA" is excluded from the council members.
        if name != "Granicus BA":
            terms[name].append(office)

    members = {}

    for member, offices in terms.items():
        p = Person(member)

        for term in offices:
            role = term['OfficeRecordTitle']
            # No district is attached to the term.
            p.add_term(role, 'legislature',
                       start_date=self.toDate(term['OfficeRecordStartDate']),
                       end_date=self.toDate(term['OfficeRecordEndDate']))

        # Sources are derived from the last office record of the person.
        source_urls = self.person_sources_from_office(term)
        person_api_url, person_web_url = source_urls
        p.add_source(person_api_url, note='api')
        p.add_source(person_web_url, note='web')

        members[member] = p

    for body in self.bodies():
        if body['BodyTypeId'] != body_types['Standing Committees']:
            continue

        o = Organization(body['BodyName'],
                         classification='committee',
                         parent_id={'name': 'Sacramento City Council'})

        o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body),
                     note='api')
        o.add_source(
            self.WEB_URL
            + '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'.format(**body),
            note='web')

        for office in self.body_offices(body):
            # messed up record for joanna thompson
            if office['OfficeRecordId'] == 1055:
                continue

            role = office['OfficeRecordTitle']
            if role not in ("Vice Chair", "Chairperson"):
                role = 'Member'

            person = office['OfficeRecordFullName'].strip()

            if person in members:
                p = members[person]
            else:
                # Committee-only person: create and source them here.
                p = Person(person)

                source_urls = self.person_sources_from_office(office)
                person_api_url, person_web_url = source_urls
                p.add_source(person_api_url, note='api')
                p.add_source(person_web_url, note='web')

                members[person] = p

            p.add_membership(body['BodyName'],
                             role=role,
                             start_date=self.toDate(office['OfficeRecordStartDate']),
                             end_date=self.toDate(office['OfficeRecordEndDate']))

        yield o

    # Boards and commissions are emitted without any memberships.
    for body in self.bodies():
        if body['BodyTypeId'] != body_types['Boards or Commission']:
            continue

        o = Organization(body['BodyName'],
                         classification='commission',
                         parent_id={'name': 'Sacramento City Council'})

        o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body),
                     note='api')
        o.add_source(
            self.WEB_URL
            + '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'.format(**body),
            note='web')

        yield o

    for p in members.values():
        yield p
def scrape(self):
    """Yield Minneapolis school board, city council and park board people.

    Three sites are scraped in turn, and a Person is yielded for each
    member found:

      1. the school board site (term 'Director', organization
         'Minneapolis School Board');
      2. the city council page (term 'Councilmember', organization
         'Minneapolis City Council');
      3. the park board commissioners page (term 'Commissioner',
         organization 'Minneapolis Parks and Recreation').

    Raises:
        IndexError: if a page lacks one of the elements read by
            position (only the school-board headshot is optional).
    """
    # School Board
    school = 'http://board.mpls.k12.mn.us/'
    body = requests.get(school)
    base = html.fromstring(body.text)
    base.make_links_absolute(school)

    members = base.xpath('.//*/div[@class="summary"]')
    members = list(set(members))

    for member in members:
        b = {}
        b['term'] = member.xpath('.//span/p/text()')[-1].replace(
            '\r\n\t', '').replace('\xa0', '').replace('|', '').strip()
        b['district'] = member.xpath('.//*/a/text()')[-1]

        # Follow the first link in the summary to the member's own page.
        link = member.xpath('.//*/@href')[0]
        member_base = requests.get(link)
        member_base = html.fromstring(member_base.text)
        member_base.make_links_absolute(school)

        text = member_base.xpath('.//*/span/p/text()')
        text = [mb.strip() for mb in text]
        text = [t for t in text if len(t) > 0]

        # The role is the part after the first comma of the first
        # non-empty paragraph.
        b['role'] = text[0].split(',')[1]
        b['email'] = member_base.xpath('.//*/span/p/a/@href')[0]
        b['name'] = member_base.xpath('.//*/div/span/text()')[1]

        # The headshot is optional: only a missing second link is
        # tolerated, any other error propagates.
        try:
            b['headshot'] = member_base.xpath('.//*/div/a/@href')[1]
        except IndexError:
            pass

        member = Person(name=b['name'], role=b['role'])
        member.add_source(url=school)
        member.add_term('Director', 'legislature',
                        org_name='Minneapolis School Board',
                        district=b['district'])
        yield member

    # City Council
    council = 'http://www.minneapolismn.gov/council/'
    body = requests.get(council)
    base = html.fromstring(body.text)
    base.make_links_absolute(council)

    wards = base.xpath('.//*/ul[@id="cname"]/li')
    for w in wards:
        i = {}
        link = w.xpath('.//a/@href')[0]
        text = w.xpath('.//a/text()')[0]
        i['link'] = link
        # The link text is split on '-': ward on the left, name on the
        # right.
        i['ward'] = text.split('-')[0].strip()
        i['name'] = text.split('-')[1].strip()

        member = Person(name=i['name'], role='Council Member')
        member.add_source(link)
        member.add_term('Councilmember', 'legislature',
                        org_name='Minneapolis City Council',
                        district=i['ward'])
        yield member

    # Park and Rec Board
    parks = 'https://www.minneapolisparks.org/about_us/leadership_and_structure/commissioners/'
    body = requests.get(parks)
    base = html.fromstring(body.text)
    base.make_links_absolute(parks)

    member_base = base.xpath('.//*/div[@class="col-12"]/div/div/a')
    for mb in member_base:
        m = {}
        m['name'] = mb.xpath('.//h3/text()')[0]
        m['link'] = mb.xpath('.//@href')[0]
        m['headshot'] = mb.xpath('.//img/@src')[0]

        post_base = mb.xpath('.//p/span/text()')[0]
        m['post'] = post_base.replace('Commissioner', '').strip()

        # "<post>,<role>" carries an explicit role; otherwise the member
        # is a plain commissioner.
        if ',' in m['post']:
            m['role'] = m['post'].split(',')[1]
            m['post'] = m['post'].split(',')[0]
        else:
            m['role'] = 'Commissioner'

        member = Person(name=m['name'], role=m['role'])
        member.add_source(url=parks)
        member.add_term('Commissioner', 'legislature',
                        org_name='Minneapolis Parks and Recreation',
                        district=m['post'])
        yield member
def scrape(self):
    '''
    Scrape the web to create a dict with all active organizations.
    Then, we can access the correct URL for the organization detail page.

    Yields committee Organizations as they are built, followed by every
    Person encountered (board members and committee-only members).

    Raises:
        ValueError: if there is not exactly one body named
            'Board of Directors - Regular Board Meeting'.
        AssertionError: if a board member's given and family names do
            not match the full name on their office records.
    '''
    web_scraper = LegistarPersonScraper(
        requests_per_minute=self.requests_per_minute)
    web_scraper.MEMBERLIST = 'https://metro.legistar.com/People.aspx'

    # Organization name (stripped) -> its 'Department Name' web entry.
    web_info = {}
    for _, organizations in web_scraper.councilMembers():
        for organization, _, _ in organizations:
            organization_name = organization['Department Name']['label'].strip()
            organization_info = organization['Department Name']
            web_info[organization_name] = organization_info

    body_types = self.body_types()

    board_of_directors, = [
        body for body in self.bodies()
        if body['BodyName'] == 'Board of Directors - Regular Board Meeting'
    ]
    board_of_directors["BodyName"] = "Board of Directors"

    # Board office records grouped by the holder's full name.
    terms = collections.defaultdict(list)
    for office in self.body_offices(board_of_directors):
        terms[office['OfficeRecordFullName']].append(office)

    members = {}
    for member, offices in terms.items():
        p = Person(member)

        for term in offices:
            role = term['OfficeRecordTitle']

            # Titles other than plain membership get a term of their own.
            if role not in {'Board Member', 'non-voting member'}:
                p.add_term(
                    role, 'legislature',
                    start_date=self.toDate(term['OfficeRecordStartDate']),
                    end_date=self.toDate(term['OfficeRecordEndDate']),
                    appointment=True)

            # Everyone except the CEO also gets a board-membership term.
            if role != 'Chief Executive Officer':
                if role == 'non-voting member':
                    member_type = 'Nonvoting Board Member'
                    post = NONVOTING_POSTS.get(member)
                else:
                    member_type = 'Board Member'
                    post = VOTING_POSTS.get(member)

                start_date = self.toDate(term['OfficeRecordStartDate'])
                end_date = self.toDate(term['OfficeRecordEndDate'])

                board_membership = p.add_term(member_type, 'legislature',
                                              district=post,
                                              start_date=start_date,
                                              end_date=end_date)

                acting_member_end_date = ACTING_MEMBERS_WITH_END_DATE.get(
                    p.name)
                if acting_member_end_date and acting_member_end_date <= end_date:
                    board_membership.extras = {'acting': 'true'}

        # Each term contains first and last names. This should be the
        # same across all of a person's terms, so go ahead and grab them
        # from the last term in the array.
        p.family_name = term['OfficeRecordLastName']
        p.given_name = term['OfficeRecordFirstName']

        # Defensively assert that the given and family names match the
        # expected value.
        if member == 'Hilda L. Solis':
            # Given/family name does not contain middle initial.
            assert p.given_name == 'Hilda' and p.family_name == 'Solis'
        else:
            assert member == ' '.join([p.given_name, p.family_name])

        source_urls = self.person_sources_from_office(term)
        person_api_url, person_web_url = source_urls
        p.add_source(person_api_url, note='api')
        p.add_source(person_web_url, note='web')

        members[member] = p

    committee_type_ids = (
        body_types['Committee'],
        body_types['Independent Taxpayer Oversight Committee'])

    for body in self.bodies():
        if body['BodyTypeId'] not in committee_type_ids:
            continue

        organization_name = body['BodyName'].strip()

        o = Organization(organization_name,
                         classification='committee',
                         parent_id={'name': 'Board of Directors'})

        organization_info = web_info.get(organization_name, {})
        # Fall back to the departments listing when the web scrape did
        # not see this organization. The fallback is already an absolute
        # URL, so it must not be prefixed with self.WEB_URL.
        organization_url = organization_info.get(
            'url', 'https://metro.legistar.com/Departments.aspx')

        o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body),
                     note='api')
        o.add_source(organization_url, note='web')

        for office in self.body_offices(body):
            role = office['OfficeRecordTitle']

            if role not in BOARD_OFFICE_ROLES:
                if role == 'non-voting member':
                    role = 'Nonvoting Member'
                else:
                    role = 'Member'

            person = office['OfficeRecordFullName']

            if person in members:
                p = members[person]
            else:
                # Committee-only person: create and source them here.
                p = Person(person)

                source_urls = self.person_sources_from_office(office)
                person_api_url, person_web_url = source_urls
                p.add_source(person_api_url, note='api')
                p.add_source(person_web_url, note='web')

                members[person] = p

            start_date = self.toDate(office['OfficeRecordStartDate'])
            end_date = self.toDate(office['OfficeRecordEndDate'])

            membership = p.add_membership(organization_name,
                                          role=role,
                                          start_date=start_date,
                                          end_date=end_date)

            acting_member_end_date = ACTING_MEMBERS_WITH_END_DATE.get(p.name)
            if acting_member_end_date and acting_member_end_date <= end_date:
                membership.extras = {'acting': 'true'}

        yield o

    for p in members.values():
        yield p
def get_organizations(self):
    """Yield Pittsburgh's organizations and a few hand-maintained people.

    Yields, in order: the City Council (with posts for districts 1-9), the
    "Standing Committee" organization, the Mayor's office, the City Clerk's
    office, the two City Clerk office holders, and a placeholder person
    named "All Members".
    """
    city_division = "ocd-division/country:us/state:pa/place:pittsburgh"
    district_division = city_division + "/council_district:{}"

    council = Organization(name="Pittsburgh City Council",
                           classification="legislature")
    for district in range(1, 10):
        council.add_post("District {}".format(district),
                         "Councilmember",
                         division_id=district_division.format(district))
    yield council

    committee = Organization(name="Standing Committee",
                             classification="committee")
    committee.add_source(
        "http://pittsburghpa.gov/council/standing-committees", note="web")
    yield committee

    mayor_office = Organization(name="Mayor", classification="executive")
    mayor_office.add_post("Mayor", "Mayor", division_id=city_division)
    mayor_office.add_source("http://pittsburghpa.gov/mayor/index.html",
                            note="web")
    yield mayor_office

    # TODO: figure out disambiguation for councilman/mayor positions
    # (using birth_date?).  Until then the mayors stay disabled:
    #
    # peduto = Person(name="William Peduto", birth_date=datetime.date(1964, 10, 30))
    # peduto.add_term("Mayor",
    #                 "executive",
    #                 start_date=datetime.date(2014, 1 ,6),
    #                 appointment=True)
    # peduto.add_source("http://pittsburghpa.gov/mayor/mayor-profile")
    # yield peduto
    #
    # ravenstahl = Person(name="Luke Ravenstahl", birth_date=datetime.date(1980, 2, 6))
    # ravenstahl.add_term("Mayor",
    #                     "executive",
    #                     start_date=datetime.date(2006, 9, 1),
    #                     end_date=datetime.date(2014, 1 ,6),
    #                     appointment=True)
    # ravenstahl.add_source("https://www.post-gazette.com/local/city/2006/09/01/Ravenstahl-sworn-in-as-Pittsburgh-mayor/stories/200609010229")
    # yield ravenstahl

    clerk_office = Organization(name="City Clerk",
                                classification="department")
    clerk_office.add_post("City Clerk", "City Clerk",
                          division_id=city_division)
    clerk_office.add_source("http://pittsburghpa.gov/clerk/", note="web")
    yield clerk_office

    current_clerk = Person(name="Brenda Pree")
    current_clerk.add_term("City Clerk",
                           "department",
                           start_date=datetime.date(2017, 8, 29),
                           appointment=True)
    current_clerk.add_source("http://pittsburghpa.gov/clerk/clerk-bio")
    yield current_clerk

    former_clerk = Person(name="Mary Beth Doheny")
    former_clerk.add_term("City Clerk",
                          "department",
                          start_date=datetime.date(2014, 3, 18),
                          end_date=datetime.date(2017, 8, 28),
                          appointment=True)
    former_clerk.add_source("http://pittsburghpa.gov")
    yield former_clerk

    # Pittsburgh Legistar has a Person entry literally named "All Members";
    # the import trips unless a matching person exists.  The council term
    # is given a start date of 1816-03-18 and no end date.
    everyone = Person(name="All Members")
    everyone.add_term("City Council",
                      "legislature",
                      start_date=datetime.date(1816, 3, 18))
    everyone.add_source("http://pittsburghpa.gov/council/index.html")
    yield everyone
def scrape(self):
    """Yield a Person for every legislator returned by the GraphQL queries.

    Two fixed queries (one per organization id) are always run.  People in
    ``rep_names`` (a name defined outside this block) that the first query
    did not return are then looked up one by one with an extra by-name
    query.  Note that ``rep_names`` is mutated in place.
    """
    url = 'http://alpha.openstates.org/graphql'
    primary_org = 'ocd-organization/e91db6f8-2232-49cd-91af-fdb5adb4ac3b'
    secondary_org = 'ocd-organization/6a026144-758d-4d57-b856-9c60dce3c4b5'

    # Field selection shared by every query.
    selection = (
        ' { edges { node { name'
        ' party: currentMemberships(classification:"party")'
        ' { organization { name }}'
        ' links { url } sources { url }'
        ' chamber: currentMemberships(classification:["upper", "lower"])'
        ' { post { label } organization { name classification'
        ' parent { name }}}}}}}'
    )

    def people_query(org_id, person_name=None):
        """Build the request payload for one people() query."""
        arguments = 'memberOf:"' + org_id + '", first: 100'
        if person_name is not None:
            arguments += ', name: "' + person_name + '"'
        return {'query': '{ people(' + arguments + ')' + selection}

    # A third variant of the primary query using `last: 100` instead of
    # `first: 100` used to sit between these two; it is disabled.
    scrapers = [people_query(primary_org), people_query(secondary_org)]

    # Cross off everybody the primary query already returns.
    first_page = requests.get(url=url, json=scrapers[0]).json()
    for edge in first_page['data']['people']['edges']:
        found = edge['node']['name']
        if found in rep_names:
            rep_names.remove(found)

    # Get names unretrieved from primary House API Query
    print('REP NAMES: ', rep_names)
    rep_names.remove('Gene Pelowski')
    for missing in rep_names:
        scrapers.append(people_query(primary_org, missing))

    for payload in scrapers:
        response = requests.get(url=url, json=payload).json()
        print(response)
        for edge in response['data']['people']['edges']:
            node = edge['node']
            memberships = node['chamber']
            rep = Person(name=node['name'], role='State Representative')
            for membership in memberships:
                ppr(membership)
                organization = membership['organization']
                name = organization['name']
                classification = organization['classification']
                if not organization['parent']:
                    rep.add_membership(name)
                    rep.add_source(node['sources'][0]['url'])
                    continue
                pname = organization['parent']['name']
                if pname == 'Minnesota Legislature':
                    label = membership['post']['label']
                    # `role` is only (re)assigned when the organization
                    # name mentions a chamber; otherwise the previous
                    # value carries over.
                    if 'House' in name:
                        role = 'State Representative'
                    elif 'Senate' in name:
                        role = 'State Senator'
                    rep.add_term(role, classification,
                                 district=label, org_name=name)
                    rep.add_source(node['sources'][0]['url'])
            yield rep
def scrape(self):
    """Yield Chicago committees, joint committees and all people seen.

    Council terms are read from the API offices of the 'City Council'
    body; ward, photo and contact details come from the Legistar web
    member list.  Committee bodies are yielded with their members'
    memberships attached to the people, joint committees are yielded
    without members, and the people are yielded last.
    """
    body_types = self.body_types()

    # One-element unpacking: raises ValueError unless exactly one body is
    # named 'City Council'.
    council_bodies = [body for body in self.bodies()
                      if body['BodyName'] == 'City Council']
    (city_council,) = council_bodies

    # Group the council's office records by (stripped) holder name,
    # ignoring vacancy placeholders.
    terms = collections.defaultdict(list)
    for office in self.body_offices(city_council):
        holder = office['OfficeRecordFullName']
        if 'vacan' in holder.lower():
            continue
        terms[holder.strip()].append(office)

    web_scraper = LegistarPersonScraper(
        requests_per_minute=self.requests_per_minute)
    web_scraper.MEMBERLIST = 'https://chicago.legistar.com/DepartmentDetail.aspx?ID=12357&GUID=4B24D5A9-FED0-4015-9154-6BFFFB2A8CB4&R=8bcbe788-98cd-4040-9086-b34fa8e49881'
    web_scraper.ALL_MEMBERS = '3:3'

    if self.cache_storage:
        web_scraper.cache_storage = self.cache_storage
    if self.requests_per_minute == 0:
        web_scraper.cache_write_only = False

    member_filter = {'ctl00$ContentPlaceHolder$lstName': 'City Council'}
    web_info = {}
    for listing, _ in web_scraper.councilMembers(member_filter):
        web_info[listing['Person Name']['label']] = listing

    # These two are given hand-entered wards; every other field of theirs
    # reads as None.
    for name, ward in (('Balcer, James', 11), ('Fioretti, Bob', 2)):
        web_info[name] = collections.defaultdict(lambda: None)
    web_info['Balcer, James']['Ward/Office'] = 11
    web_info['Fioretti, Bob']['Ward/Office'] = 2

    # (web field / note, contact type) in the order they are added.
    contact_fields = (
        ("City Hall Address", "address"),
        ("City Hall Phone", "voice"),
        ("Ward Office Phone", "voice"),
        ("Ward Office Address", "address"),
        ("Fax", "fax"),
    )

    members = {}
    for member, offices in terms.items():
        web = web_info[member]
        p = Person(member)

        for term in offices:
            role = term['OfficeRecordTitle']
            p.add_term(
                'Alderman',
                'legislature',
                district="Ward {}".format(int(web['Ward/Office'])),
                start_date=self.toDate(term['OfficeRecordStartDate']),
                end_date=self.toDate(term['OfficeRecordEndDate']),
            )

        if web.get('Photo'):
            p.image = web['Photo']

        for field, kind in contact_fields:
            if web[field] and web[field] != 'N/A':
                p.add_contact_detail(type=kind, value=web[field], note=field)

        email = web["E-mail"]
        if email and email["label"] and email["label"] != 'N/A':
            p.add_contact_detail(type="email",
                                 value=web['E-mail']['label'],
                                 note='E-mail')

        if web['Website']:
            p.add_link(web['Website']['url'])

        # Sources are derived from the last office record of the person.
        person_api_url, person_web_url = \
            self.person_sources_from_office(term)
        p.add_source(person_api_url, note='api')
        p.add_source(person_web_url, note='web')

        members[member] = p

    for body in self.bodies():
        if body['BodyTypeId'] != body_types['Committee']:
            continue

        o = Organization(body['BodyName'],
                         classification='committee',
                         parent_id={'name': 'Chicago City Council'})
        o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body),
                     note='api')
        o.add_source(self.WEB_URL + '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'.format(**body),
                     note='web')

        for office in self.body_offices(body):
            # messed up record for joanna thompson
            if office['OfficeRecordId'] == 1055:
                continue

            role = office['OfficeRecordTitle']
            if role not in ("Vice Chair", "Chairman"):
                role = 'Member'

            person = office['OfficeRecordFullName'].strip()
            try:
                p = members[person]
            except KeyError:
                # First sighting of somebody who never held a council
                # office: create and register them.
                p = Person(person)
                person_api_url, person_web_url = \
                    self.person_sources_from_office(office)
                p.add_source(person_api_url, note='api')
                p.add_source(person_web_url, note='web')
                members[person] = p

            # toDate raising TypeError is treated as "no end date".
            try:
                end_date = self.toDate(office['OfficeRecordEndDate'])
            except TypeError:
                end_date = ''

            p.add_membership(
                body['BodyName'],
                role=role,
                start_date=self.toDate(office['OfficeRecordStartDate']),
                end_date=end_date,
            )

        yield o

    for body in self.bodies():
        if body['BodyTypeId'] != body_types['Joint Committee']:
            continue
        o = Organization(body['BodyName'],
                         classification='committee',
                         parent_id={'name': 'Chicago City Council'})
        o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body),
                     note='api')
        o.add_source(self.WEB_URL + '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'.format(**body),
                     note='web')
        yield o

    for p in members.values():
        yield p
def scrape(self):
    """Yield Sacramento committees, boards/commissions and all people seen.

    Council terms come from the API offices of the body named
    'City Council ' (the trailing space is part of the name being
    matched).  Standing committees are yielded with memberships attached
    to their people; boards and commissions are yielded without members;
    people are yielded last.
    """
    body_types = self.body_types()

    # One-element unpacking: raises ValueError unless exactly one body
    # matches.
    council_bodies = [body for body in self.bodies()
                      if body['BodyName'] == 'City Council ']
    (city_council,) = council_bodies

    # Group office records by holder, dropping the "Granicus BA" entry.
    terms = collections.defaultdict(list)
    for office in self.body_offices(city_council):
        if office['OfficeRecordFullName'] == "Granicus BA":
            continue
        terms[office['OfficeRecordFullName']].append(office)

    members = {}
    for member, offices in terms.items():
        p = Person(member)

        for term in offices:
            role = term['OfficeRecordTitle']
            # No district is recorded; the original call carried a
            # disabled argument:
            # district = "District {}".format(int(web['District/Office'])),
            p.add_term(
                role,
                'legislature',
                start_date=self.toDate(term['OfficeRecordStartDate']),
                end_date=self.toDate(term['OfficeRecordEndDate']),
            )

        # Sources are derived from the person's last office record.
        person_api_url, person_web_url = \
            self.person_sources_from_office(term)
        p.add_source(person_api_url, note='api')
        p.add_source(person_web_url, note='web')

        members[member] = p

    for body in self.bodies():
        if body['BodyTypeId'] != body_types['Standing Committees']:
            continue

        o = Organization(body['BodyName'],
                         classification='committee',
                         parent_id={'name': 'Sacramento City Council'})
        o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body),
                     note='api')
        o.add_source(
            self.WEB_URL
            + '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'.format(**body),
            note='web')

        for office in self.body_offices(body):
            # messed up record for joanna thompson
            if office['OfficeRecordId'] == 1055:
                continue

            role = office['OfficeRecordTitle']
            if role not in ("Vice Chair", "Chairperson"):
                role = 'Member'

            person = office['OfficeRecordFullName'].strip()
            try:
                p = members[person]
            except KeyError:
                # Committee member without a council term: create and
                # register them so they are yielded at the end.
                p = Person(person)
                person_api_url, person_web_url = \
                    self.person_sources_from_office(office)
                p.add_source(person_api_url, note='api')
                p.add_source(person_web_url, note='web')
                members[person] = p

            p.add_membership(
                body['BodyName'],
                role=role,
                start_date=self.toDate(office['OfficeRecordStartDate']),
                end_date=self.toDate(office['OfficeRecordEndDate']),
            )

        yield o

    for body in self.bodies():
        if body['BodyTypeId'] != body_types['Boards or Commission']:
            continue
        o = Organization(body['BodyName'],
                         classification='commission',
                         parent_id={'name': 'Sacramento City Council'})
        o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body),
                     note='api')
        o.add_source(
            self.WEB_URL
            + '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'.format(**body),
            note='web')
        yield o

    for p in members.values():
        yield p
def get_organizations(self):
    """Yield Minneapolis organizations, the mayor, and council committees.

    Yields, in order: the city (executive), the City Council with posts
    for wards 1-13, the mayor, the Parks and Recreation board, the School
    Board, and then one committee organization per entry returned by the
    LIMS committee endpoint (the entry named 'City Council' is skipped
    because the council is already yielded above).

    Only the ``Name`` and ``StartDate`` fields of each committee entry
    are read; the other fields of the payload are not used, so their
    absence no longer raises ``KeyError``.
    """
    city_division = 'ocd-division/country:us/state:mn/place:minneapolis'

    city = Organization('City of Minneapolis', classification='executive')
    city.add_post('Mayor', 'Mayor', division_id=city_division)
    city.add_post('City Clerk', 'City Clerk', division_id=city_division)
    yield city

    council = Organization(name="Minneapolis City Council",
                           classification="legislature",
                           parent_id=city)
    for x in range(1, 14):
        council.add_post(
            "Ward {}".format(x),
            "Councilmember",
            division_id='ocd-division/country:us/state:mn/place:minneapolis/ward:{}'.format(x))
    yield council

    frey = Person(name="Frey, Jacob")
    frey.add_term('Mayor',
                  'executive',
                  start_date=datetime.date(2018, 1, 19),
                  appointment=True)
    # NOTE(review): this source URL looks like a placeholder - confirm.
    frey.add_source('http://www.google.com')
    yield frey

    parks = Organization('Minneapolis Parks and Recreation',
                         classification='legislature')
    for x in range(1, 7):
        parks.add_post("District {}".format(x), "Commissioner")
    parks.add_post("At Large", "Commissioner")
    yield parks

    school = Organization('Minneapolis School Board',
                          classification='legislature')
    for x in range(1, 7):
        school.add_post("District {}".format(x), "Director")
    school.add_post("At Large", "Director")
    yield school

    cmt_link = 'https://lims.minneapolismn.gov/Calendar/GetCommittees'
    cmts = requests.get(cmt_link).json()

    for c in cmts:
        name = c['Name']
        if name == 'City Council':
            continue

        org = Organization(name, classification='committee',
                           parent_id=council)
        org.add_source(cmt_link)

        # StartDate is split on 'T' and only the leading (date) part is
        # kept.
        start_date = c['StartDate']
        if start_date is not None:
            org.founding_date = start_date.split('T')[0]
        yield org
def scrape(self):
    """Yield New York City Council committees and every person seen.

    Member details (district, party, photo, contacts, notes) come from
    the Legistar web member list; terms come from the API office records
    of the 'City Council' body.  Public Advocate records are renamed via
    ``public_advocates`` so they merge with the same person's council
    record.  Committee-like bodies are yielded as organizations, with
    memberships attached to the people, and all people are yielded last.

    Raises:
        AssertionError: if the set of names known to the web scrape and
            the set of names found in the API office records differ.
        ValueError: if there is not exactly one body named 'City Council'.
    """
    web_scraper = LegistarPersonScraper(
        requests_per_minute=self.requests_per_minute)
    web_scraper.MEMBERLIST = 'http://legistar.council.nyc.gov/DepartmentDetail.aspx?ID=6897&GUID=CDC6E691-8A8C-4F25-97CB-86F31EDAB081&Mode=MainBody'

    if self.cache_storage:
        web_scraper.cache_storage = self.cache_storage

    if self.requests_per_minute == 0:
        web_scraper.cache_write_only = False

    web_info = {}
    for member, _ in web_scraper.councilMembers():
        name = member['Person Name']['label'].strip()
        web_info[name] = member

    city_council, = [body for body in self.bodies()
                     if body['BodyName'] == 'City Council']

    terms = collections.defaultdict(list)

    public_advocates = {
        # Match casing to Bill De Blasio as council member
        'The Public Advocate (Mr. de Blasio)': 'Bill De Blasio',
        'The Public Advocate (Ms. James)': 'Letitia James',
    }

    for office in self.body_offices(city_council):
        name = office['OfficeRecordFullName']
        name = public_advocates.get(name, name).strip()

        terms[name].append(office)

        # Add past members (and public advocates), who are absent from
        # the web member list, with every web field reading as None.
        if name not in web_info:
            web_info[name] = collections.defaultdict(lambda: None)

    # Check that we have everyone we expect, formatted consistently, in
    # both information arrays. For instance, this will fail if we forget to
    # strip trailing spaces from names on one side or the other (which has
    # the effect of omitting information, such as post, from the scrape).
    assert set(web_info.keys()) == set(terms.keys())

    # web field -> (contact type, note); loop-invariant, so built once.
    contact_types = {
        "City Hall Office": ("address", "City Hall Office"),
        "City Hall Phone": ("voice", "City Hall Phone"),
        "Ward Office Phone": ("voice", "Ward Office Phone"),
        "Ward Office Address": ("address", "Ward Office Address"),
        "Fax": ("fax", "Fax"),
    }

    members = {}

    for member, offices in terms.items():
        p = Person(member)
        web = web_info[member]

        for term in offices:
            role = term['OfficeRecordTitle']

            if role == 'Public Advocate':
                role = 'Non-Voting Council Member'
            else:
                role = 'Council Member'

            district = web.get('District', '').replace(' 0', ' ')

            p.add_term(role,
                       'legislature',
                       district=district,
                       start_date=self.toDate(term['OfficeRecordStartDate']),
                       end_date=self.toDate(term['OfficeRecordEndDate']))

            party = web.get('Political Party')

            if party == 'Democrat':
                party = 'Democratic'

            if party:
                p.add_party(party)

            if web.get('Photo'):
                p.image = web['Photo']

            for contact_type, (type_, _note) in contact_types.items():
                # The original called the mapping (`web(contact_type)`),
                # which raised TypeError for any populated field; the
                # value must be looked up, not called.
                value = web.get(contact_type)
                if value and value != 'N/A':
                    p.add_contact_detail(type=type_,
                                         value=value,
                                         note=_note)

            if web.get('E-mail'):
                p.add_contact_detail(type="email",
                                     value=web['E-mail']['url'],
                                     note='E-mail')

            if web.get('Web site'):
                p.add_link(web['Web site']['url'], note='web site')

            if web.get('Notes'):
                p.extras = {'Notes': web['Notes']}

            if not p.sources:  # Only add sources once
                source_urls = self.person_sources_from_office(term)
                person_api_url, person_web_url = source_urls
                p.add_source(person_api_url, note='api')
                p.add_source(person_web_url, note='web')

        members[member] = p

    committee_types = ['Committee',
                       'Inactive Committee',
                       'Select Committee',
                       'Subcommittee',
                       'Task Force',
                       'Land Use',  # Committee on Land Use
                       ]

    body_types = {k: v for k, v in self.body_types().items()
                  if k in committee_types}

    for body in self.bodies():
        is_committee = (
            body['BodyTypeName'] in body_types
            or body['BodyName'] in ('Legislative Documents Unit',
                                    'Legal and Government Affairs Division')
        )
        if not is_committee:
            continue

        # Skip typo in API data
        if body['BodyName'] == 'Committee on Mental Health, Developmental Disability, Alcoholism, Substance Abuse amd Disability Services':
            continue

        parent_org = PARENT_ORGS.get(body['BodyName'],
                                     'New York City Council')

        body_name = body['BodyName']

        o = Organization(body_name,
                         classification='committee',
                         parent_id={'name': parent_org})

        o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body),
                     note='api')
        o.add_source(self.WEB_URL + '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'.format(**body),
                     note='web')

        for office in self.body_offices(body):
            # Possible roles: 'Council Member', 'MEMBER', 'Ex-Officio',
            # 'Committee Member', None, 'CHAIRPERSON'
            role = office['OfficeRecordTitle']

            if role and role.lower() == 'chairperson':
                role = 'Chairperson'
            else:
                role = 'Member'

            person = office['OfficeRecordFullName']
            person = public_advocates.get(person, person).strip()

            if person in members:
                p = members[person]
            else:
                # Committee member with no council office record: create
                # and register them so they are yielded at the end.
                p = Person(person)

                source_urls = self.person_sources_from_office(office)
                person_api_url, person_web_url = source_urls
                p.add_source(person_api_url, note='api')
                p.add_source(person_web_url, note='web')

                members[person] = p

            p.add_membership(o,
                             role=role,
                             start_date=self.toDate(office['OfficeRecordStartDate']),
                             end_date=self.toDate(office['OfficeRecordEndDate']))

        yield o

    for p in members.values():
        yield p
def get_organizations(self):
    """Yield Chicago's council, the city, and its mayors and city clerks.

    Yields the City Council (posts for wards 1-50), the City of Chicago
    executive organization (Mayor and City Clerk posts), and then one
    Person per hand-maintained office holder, each with a single
    appointed 'executive' term.
    """
    city_division = 'ocd-division/country:us/state:il/place:chicago'

    council = Organization(name="Chicago City Council",
                           classification="legislature")
    for ward in range(1, 51):
        council.add_post(
            "Ward {}".format(ward),
            "Alderman",
            division_id=(city_division + '/ward:{}').format(ward))
    yield council

    city = Organization('City of Chicago', classification='executive')
    for post in ('Mayor', 'City Clerk'):
        city.add_post(post, post, division_id=city_division)
    yield city

    # (name, role, term dates) in the order the people are yielded.  A
    # term without an 'end_date' entry is passed to add_term without one.
    office_holders = (
        ("Daley, Richard M.", 'Mayor',
         {'start_date': datetime.date(1989, 4, 24),
          'end_date': datetime.date(2011, 5, 16)}),
        ("Emanuel, Rahm", 'Mayor',
         {'start_date': datetime.date(2011, 5, 16)}),
        ('Mendoza, Susana A.', 'City Clerk',
         {'start_date': datetime.date(2011, 5, 16),
          'end_date': datetime.date(2016, 12, 4)}),
        ('Del Valle, Miguel', 'City Clerk',
         {'start_date': datetime.date(2006, 12, 1),
          'end_date': datetime.date(2011, 5, 16)}),
        ('Valencia, Anna M.', 'City Clerk',
         {'start_date': datetime.date(2017, 1, 25),
          'end_date': datetime.date(2019, 5, 20)}),
    )

    for full_name, role, dates in office_holders:
        holder = Person(name=full_name)
        holder.add_term(role, 'executive', appointment=True, **dates)
        holder.add_source('https://chicago.legistar.com/People.aspx')
        yield holder
def scrape(self):
    """Yield Chicago committees, joint committees and all people seen.

    Terms come from the API office records of the 'City Council' body
    (records whose holder name contains 'VACAN' are ignored); ward, photo
    and contact details come from the Legistar web member list.
    Committees are yielded with memberships attached to their people,
    joint committees without members, and people last.
    """
    body_types = self.body_types()

    # One-element unpacking: raises ValueError unless exactly one body is
    # named 'City Council'.
    (city_council,) = [candidate for candidate in self.bodies()
                       if candidate['BodyName'] == 'City Council']

    terms = collections.defaultdict(list)
    for record in self.body_offices(city_council):
        holder = record['OfficeRecordFullName']
        if 'VACAN' in holder:
            continue
        terms[holder.strip()].append(record)

    web_scraper = LegistarPersonScraper(None, None)
    web_scraper.MEMBERLIST = 'https://chicago.legistar.com/DepartmentDetail.aspx?ID=12357&GUID=4B24D5A9-FED0-4015-9154-6BFFFB2A8CB4&R=8bcbe788-98cd-4040-9086-b34fa8e49881'
    web_scraper.ALL_MEMBERS = '3:3'

    web_info = {}
    listings = web_scraper.councilMembers(
        {'ctl00$ContentPlaceHolder$lstName': 'City Council'})
    for listing, _ in listings:
        web_info[listing['Person Name']['label']] = listing

    # Hand-entered wards for two people; all their other web fields read
    # as None.
    web_info['Balcer, James'] = collections.defaultdict(lambda: None)
    web_info['Fioretti, Bob'] = collections.defaultdict(lambda: None)
    web_info['Balcer, James']['Ward/Office'] = 11
    web_info['Fioretti, Bob']['Ward/Office'] = 2

    def attach_sources(who, office_record):
        """Add the api/web source pair derived from an office record."""
        api_url, web_url = self.person_sources_from_office(office_record)
        who.add_source(api_url, note='api')
        who.add_source(web_url, note='web')

    members = {}
    for member, offices in terms.items():
        web = web_info[member]
        p = Person(member)

        for term in offices:
            role = term['OfficeRecordTitle']
            p.add_term(
                'Alderman',
                'legislature',
                district="Ward {}".format(int(web['Ward/Office'])),
                start_date=self.toDate(term['OfficeRecordStartDate']),
                end_date=self.toDate(term['OfficeRecordEndDate']),
            )

        if web.get('Photo'):
            p.image = web['Photo']

        # Contact fields double as their own note text.
        contact_types = {
            "City Hall Address": "address",
            "City Hall Phone": "voice",
            "Ward Office Phone": "voice",
            "Ward Office Address": "address",
            "Fax": "fax",
        }
        for field, kind in contact_types.items():
            if web[field] and web[field] != 'N/A':
                p.add_contact_detail(type=kind, value=web[field], note=field)

        if (web["E-mail"]
                and web["E-mail"]["label"]
                and web["E-mail"]["label"] != 'N/A'):
            p.add_contact_detail(type="email",
                                 value=web['E-mail']['label'],
                                 note='E-mail')

        if web['Website']:
            p.add_link(web['Website']['url'])

        # Sources come from the person's last office record.
        attach_sources(p, term)
        members[member] = p

    def committee_for(body):
        """Build the committee Organization (with sources) for a body."""
        committee = Organization(body['BodyName'],
                                 classification='committee',
                                 parent_id={'name': 'Chicago City Council'})
        committee.add_source(
            self.BASE_URL + '/bodies/{BodyId}'.format(**body), note='api')
        committee.add_source(
            self.WEB_URL
            + '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'.format(**body),
            note='web')
        return committee

    for body in self.bodies():
        if body['BodyTypeId'] != body_types['Committee']:
            continue

        o = committee_for(body)

        for office in self.body_offices(body):
            # messed up record for joanna thompson
            if office['OfficeRecordId'] == 1055:
                continue

            role = office['OfficeRecordTitle']
            if role not in ("Vice Chair", "Chairman"):
                role = 'Member'

            person = office['OfficeRecordFullName'].strip()
            if person not in members:
                newcomer = Person(person)
                attach_sources(newcomer, office)
                members[person] = newcomer
            p = members[person]

            p.add_membership(
                body['BodyName'],
                role=role,
                start_date=self.toDate(office['OfficeRecordStartDate']),
                end_date=self.toDate(office['OfficeRecordEndDate']),
            )

        yield o

    for body in self.bodies():
        if body['BodyTypeId'] == body_types['Joint Committee']:
            yield committee_for(body)

    for p in members.values():
        yield p
def scrape(self):
    """Yield Pittsburgh committees and the council members.

    Terms come from the API office records of the 'City Council' body
    (records whose holder name contains "VACAN" are ignored).  E-mail
    comes from the Legistar web member list; address, phone and website
    come from the API person record returned by
    ``person_sources_from_office``.  Committee organizations are yielded
    as they are built; council members are yielded last.
    """
    body_types = self.body_types()

    # One-element unpacking: raises ValueError unless exactly one body is
    # named "City Council".
    (city_council,) = [candidate for candidate in self.bodies()
                       if candidate["BodyName"] == "City Council"]

    terms = collections.defaultdict(list)
    for record in self.body_offices(city_council):
        holder = record["OfficeRecordFullName"]
        if "VACAN" in holder:
            continue
        terms[holder.strip()].append(record)

    web_scraper = LegistarPersonScraper(
        requests_per_minute=self.requests_per_minute)
    web_scraper.MEMBERLIST = "https://pittsburgh.legistar.com/People.aspx"
    web_scraper.COMMITTEELIST = "https://pittsburgh.legistar.com/Departments.aspx"

    if self.cache_storage:
        web_scraper.cache_storage = self.cache_storage
    if self.requests_per_minute == 0:
        web_scraper.cache_write_only = False

    web_info = {listing["Person Name"]: listing
                for listing in web_scraper.councilMembers()}

    members = {}
    for member, offices in terms.items():
        person = Person(member)

        for term in offices:
            role = term["OfficeRecordTitle"]
            person.add_term(
                "Councilmember",
                "legislature",
                start_date=self.toDate(term["OfficeRecordStartDate"]),
                end_date=self.toDate(term["OfficeRecordEndDate"]),
            )

        if member in web_info:
            web = web_info[member]
            email = web["E-mail"]
            if email and email["label"] and email["label"] != "N/A":
                person.add_contact_detail(type="email",
                                          value=web["E-mail"]["label"],
                                          note="E-mail")

        # Derived from the person's last office record; the second item
        # is the API person record itself rather than a URL.
        person_api_url, person_api_response = \
            self.person_sources_from_office(term)
        person.add_source(person_api_url, note="api")

        if person_api_response["PersonAddress1"]:
            address = "".join((
                person_api_response["PersonAddress1"], ", ",
                person_api_response["PersonCity1"], ", ",
                person_api_response["PersonState1"], " ",
                person_api_response["PersonZip1"],
            ))
            person.add_contact_detail(type="address",
                                      value=address,
                                      note="Office address")

        if person_api_response["PersonPhone"]:
            person.add_contact_detail(
                type="voice",
                value=person_api_response["PersonPhone"],
                note="Office phone")

        if person_api_response["PersonWWW"]:
            person.add_contact_detail(
                type="url",
                value=person_api_response["PersonWWW"],
                note="District website")

        members[member] = person

    for body in self.bodies():
        if body["BodyTypeId"] != body_types["Committee"]:
            continue

        body_name_clean = body["BodyName"].strip()
        organization = Organization(
            body_name_clean,
            classification="committee",
            parent_id={"name": "Pittsburgh City Council"})
        organization.add_source(
            self.BASE_URL + "/bodies/{BodyId}".format(**body), note="api")

        for office in self.body_offices(body):
            # Anything other than the two chair roles (which includes
            # "Councilmember") is recorded as a plain member.
            role = office["OfficeRecordMemberType"]
            if role not in ("Vice Chair", "Chair"):
                role = "Member"

            holder = office["OfficeRecordFullName"].strip()
            # NOTE(review): a holder who is not a known council member
            # gets a throwaway Person that is never stored in `members`,
            # so it is not yielded below - confirm this is intended.
            person = members[holder] if holder in members else Person(holder)

            person.add_membership(
                body_name_clean,
                role=role,
                start_date=self.toDate(office["OfficeRecordStartDate"]),
                end_date=self.toDate(office["OfficeRecordEndDate"]),
            )

        yield organization

    for person in members.values():
        yield person
def scrape(self):
    """Yield the board's committees and every person seen.

    Terms come from the API office records of the body named
    'Board of Directors - Regular Board Meeting', which is relabelled
    "Board of Directors".  Titles other than 'Board Member' and
    'non-voting member' produce an appointed term under that title;
    everyone except the 'Chief Executive Officer' also gets a (voting or
    non-voting) board-member term with a post looked up by name.
    Committees are yielded with memberships attached to their people;
    people are yielded last.
    """
    body_types = self.body_types()

    # One-element unpacking: raises ValueError unless exactly one body
    # matches.
    (board_of_directors,) = [
        candidate for candidate in self.bodies()
        if candidate['BodyName'] == 'Board of Directors - Regular Board Meeting'
    ]
    board_of_directors["BodyName"] = "Board of Directors"

    terms = collections.defaultdict(list)
    for record in self.body_offices(board_of_directors):
        terms[record['OfficeRecordFullName']].append(record)

    members = {}
    for member, offices in terms.items():
        p = Person(member)

        for term in offices:
            role = term['OfficeRecordTitle']

            # Officer-style titles are kept as their own appointed term.
            if role not in {'Board Member', 'non-voting member'}:
                p.add_term(
                    role,
                    'legislature',
                    start_date=self.toDate(term['OfficeRecordStartDate']),
                    end_date=self.toDate(term['OfficeRecordEndDate']),
                    appointment=True)

            if role == 'Chief Executive Officer':
                continue

            if role == 'non-voting member':
                member_type, posts = 'Nonvoting Board Member', NONVOTING_POSTS
            else:
                member_type, posts = 'Board Member', VOTING_POSTS
            post = posts.get(member)

            p.add_term(
                member_type,
                'legislature',
                district=post,
                start_date=self.toDate(term['OfficeRecordStartDate']),
                end_date=self.toDate(term['OfficeRecordEndDate']))

        # Sources come from the person's last office record.
        person_api_url, person_web_url = \
            self.person_sources_from_office(term)
        p.add_source(person_api_url, note='api')
        p.add_source(person_web_url, note='web')

        members[member] = p

    for body in self.bodies():
        if body['BodyTypeId'] != body_types['Committee']:
            continue

        org_name = body['BodyName'].strip()
        o = Organization(org_name,
                         classification='committee',
                         parent_id={'name': 'Board of Directors'})
        o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body),
                     note='api')
        o.add_source(
            self.WEB_URL
            + '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'.format(**body),
            note='web')

        for office in self.body_offices(body):
            role = office['OfficeRecordTitle']
            if role not in ("Chair", "Vice Chair"):
                role = ('Nonvoting Member' if role == 'non-voting member'
                        else 'Member')

            person = office['OfficeRecordFullName']
            try:
                p = members[person]
            except KeyError:
                # Committee member without a board term: create and
                # register them so they are yielded at the end.
                p = Person(person)
                person_api_url, person_web_url = \
                    self.person_sources_from_office(office)
                p.add_source(person_api_url, note='api')
                p.add_source(person_web_url, note='web')
                members[person] = p

            p.add_membership(
                org_name,
                role=role,
                start_date=self.toDate(office['OfficeRecordStartDate']),
                end_date=self.toDate(office['OfficeRecordEndDate']))

        yield o

    for p in members.values():
        yield p
def scrape(self):
    '''
    Yield the board's committees and every person seen.

    The web is scraped first to build a dict of organizations keyed by
    name, so that the correct URL for each organization's detail page
    can be used as its web source.  Committees that the web scrape did
    not see fall back to the Legistar departments listing page.

    Terms come from the API office records of the body named
    'Board of Directors - Regular Board Meeting', which is relabelled
    "Board of Directors".  Board and committee memberships are flagged
    ``extras = {'acting': 'true'}`` when ACTING_MEMBERS_WITH_END_DATE
    holds an end date for the person that is on or before the
    membership's end date.
    '''
    web_scraper = LegistarPersonScraper(
        requests_per_minute=self.requests_per_minute)
    web_scraper.MEMBERLIST = 'https://metro.legistar.com/People.aspx'

    # organization name -> the web listing's 'Department Name' entry.
    web_info = {}
    for _, organizations in web_scraper.councilMembers():
        for organization, _, _ in organizations:
            organization_name = organization['Department Name']['label'].strip()
            organization_info = organization['Department Name']
            web_info[organization_name] = organization_info

    body_types = self.body_types()

    # One-element unpacking: raises ValueError unless exactly one body
    # matches.
    board_of_directors, = [
        body for body in self.bodies()
        if body['BodyName'] == 'Board of Directors - Regular Board Meeting'
    ]
    board_of_directors["BodyName"] = "Board of Directors"

    terms = collections.defaultdict(list)
    for office in self.body_offices(board_of_directors):
        terms[office['OfficeRecordFullName']].append(office)

    members = {}
    for member, offices in terms.items():
        p = Person(member)

        for term in offices:
            role = term['OfficeRecordTitle']

            # Officer-style titles are kept as their own appointed term.
            if role not in {'Board Member', 'non-voting member'}:
                p.add_term(role,
                           'legislature',
                           start_date=self.toDate(term['OfficeRecordStartDate']),
                           end_date=self.toDate(term['OfficeRecordEndDate']),
                           appointment=True)

            if role != 'Chief Executive Officer':
                if role == 'non-voting member':
                    member_type = 'Nonvoting Board Member'
                    post = NONVOTING_POSTS.get(member)
                else:
                    member_type = 'Board Member'
                    post = VOTING_POSTS.get(member)

                start_date = self.toDate(term['OfficeRecordStartDate'])
                end_date = self.toDate(term['OfficeRecordEndDate'])
                board_membership = p.add_term(member_type,
                                              'legislature',
                                              district=post,
                                              start_date=start_date,
                                              end_date=end_date)

                acting_member_end_date = ACTING_MEMBERS_WITH_END_DATE.get(p.name)
                if acting_member_end_date and acting_member_end_date <= end_date:
                    board_membership.extras = {'acting': 'true'}

        # Sources come from the person's last office record.
        source_urls = self.person_sources_from_office(term)
        person_api_url, person_web_url = source_urls
        p.add_source(person_api_url, note='api')
        p.add_source(person_web_url, note='web')

        members[member] = p

    for body in self.bodies():
        if body['BodyTypeId'] == body_types['Committee']:
            organization_name = body['BodyName'].strip()

            o = Organization(organization_name,
                             classification='committee',
                             parent_id={'name': 'Board of Directors'})

            organization_info = web_info.get(organization_name, {})
            # The fallback is already an absolute URL.  The original
            # prefixed it with self.WEB_URL, which is used as a URL base
            # elsewhere, and so produced a malformed source URL.
            organization_url = organization_info.get(
                'url', 'https://metro.legistar.com/Departments.aspx')

            o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body),
                         note='api')
            o.add_source(organization_url, note='web')

            for office in self.body_offices(body):
                role = office['OfficeRecordTitle']

                if role not in ("Chair", "Vice Chair", "Chief Executive Officer"):
                    if role == 'non-voting member':
                        role = 'Nonvoting Member'
                    else:
                        role = 'Member'

                person = office['OfficeRecordFullName']

                if person in members:
                    p = members[person]
                else:
                    # Committee member without a board term: create and
                    # register them so they are yielded at the end.
                    p = Person(person)

                    source_urls = self.person_sources_from_office(office)
                    person_api_url, person_web_url = source_urls
                    p.add_source(person_api_url, note='api')
                    p.add_source(person_web_url, note='web')

                    members[person] = p

                start_date = self.toDate(office['OfficeRecordStartDate'])
                end_date = self.toDate(office['OfficeRecordEndDate'])
                membership = p.add_membership(organization_name,
                                              role=role,
                                              start_date=start_date,
                                              end_date=end_date)

                acting_member_end_date = ACTING_MEMBERS_WITH_END_DATE.get(p.name)
                if acting_member_end_date and acting_member_end_date <= end_date:
                    membership.extras = {'acting': 'true'}

            yield o

    for p in members.values():
        yield p
def scrape(self):
    """
    Yield a Person for every council member and an Organization for
    every committee they sit on.

    The member list can contain several rows for the same person (one
    per stint in office). Rows are grouped by the person's detail URL,
    contiguous stints in the same district are merged into a single
    term, and the most recent row supplies the person's profile data.

    Committees are yielded after all people: first those whose name
    contains 'Committee', then those whose name contains 'Subcommittee',
    followed by two hard-coded organizations.
    """
    noncommittees = {'Committee of the Whole'}
    committee_d = {}

    # person detail URL -> ([member-list rows], committees of first row)
    people_d = {}

    # Go to memberlist
    extra_args = {'ctl00$ContentPlaceHolder$lstName': 'City Council'}

    for councilman, committees in self.councilMembers(extra_args=extra_args):
        # Rows without a detail URL cannot be grouped or sourced.
        if 'url' not in councilman['Person Name']:
            continue

        councilman_url = councilman['Person Name']['url']
        if councilman_url in people_d:
            people_d[councilman_url][0].append(councilman)
        else:
            people_d[councilman_url] = [councilman], committees

    one_day = datetime.timedelta(1)

    for person_entries, committees in people_d.values():
        # The last row seen for a person provides name, party, contact
        # details and so on.
        councilman = person_entries[-1]

        p = Person(councilman['Person Name']['label'])

        if p.name == 'Letitia James':
            p.name = 'Letitia Ms. James'
            p.add_name('Letitia James')

        spans = [(self.toTime(entry['Start Date']).date(),
                  self.toTime(entry['End Date']).date(),
                  entry['District'])
                 for entry in person_entries]

        # Merge stints that start the day after the previous one ended
        # and are in the same district.
        merged_spans = []
        for start_date, end_date, district in sorted(spans):
            if merged_spans:
                previous = merged_spans[-1]
                if (start_date - previous[1] == one_day
                        and district == previous[2]):
                    previous[1] = end_date
                    continue
            merged_spans.append([start_date, end_date, district])

        for start_date, end_date, district in merged_spans:
            # An end date of 2017-12-31 is stored as a blank end date.
            # NOTE(review): presumably the nominal end of the term that
            # was current when this was written -- confirm.
            if end_date == datetime.date(2017, 12, 31):
                end_date = ''
            else:
                end_date = end_date.isoformat()

            # Use the district of this span, not the district of the
            # most recent row, so earlier stints keep their own district.
            p.add_term('Council Member', 'legislature',
                       district=district.replace(' 0', ' '),
                       start_date=start_date.isoformat(),
                       end_date=end_date)

        party = councilman['Political Party']
        if party == 'Democrat':
            party = 'Democratic'

        if party:
            p.add_party(party)

        if councilman['Photo']:
            p.image = councilman['Photo']

        if councilman["E-mail"]:
            p.add_contact_detail(type="email",
                                 value=councilman['E-mail']['url'],
                                 note='E-mail')

        if councilman['Web site']:
            p.add_link(councilman['Web site']['url'], note='web site')

        p.extras = {'Notes': councilman['Notes']}

        p.add_source(councilman['Person Name']['url'], note='web')

        for committee, _, _ in committees:
            committee_name = committee['Department Name']['label']
            if (committee_name in noncommittees
                    or 'committee' not in committee_name.lower()):
                continue

            o = committee_d.get(committee_name, None)
            if o is None:
                parent_id = PARENT_ORGS.get(committee_name,
                                            'New York City Council')
                o = Organization(committee_name,
                                 classification='committee',
                                 parent_id={'name': parent_id})
                o.add_source(committee['Department Name']['url'])
                committee_d[committee_name] = o

            membership = o.add_member(p, role=committee["Title"])
            membership.start_date = self.mdY2Ymd(committee["Start Date"])

        yield p

    # Committees are emitted before subcommittees.
    for o in committee_d.values():
        if 'Committee' in o.name:
            yield o

    for o in committee_d.values():
        if 'Subcommittee' in o.name:
            yield o

    # Two organizations that are always emitted, regardless of what the
    # member list contained.
    o = Organization(
        'Committee on Mental Health, Developmental Disability, Alcoholism, Drug Abuse and Disability Services',
        classification='committee',
        parent_id={'name': 'New York City Council'})
    o.add_source("http://legistar.council.nyc.gov/Departments.aspx")
    yield o

    o = Organization(
        'Subcommittee on Drug Abuse',
        classification='committee',
        parent_id={
            'name': 'Committee on Mental Health, Developmental Disability, Alcoholism, Drug Abuse and Disability Services'
        })
    o.add_source("http://legistar.council.nyc.gov/Departments.aspx")
    yield o
def scrape(self):
    """
    Yield Person and Organization objects for the city council.

    Output order:

    1. one Person per current member-list row that has a numeric ward,
    2. one Person per entry in FORMER_ALDERMEN,
    3. one Organization per committee found on current members,
    4. one Organization per name in FORMER_COMMITTEES,
    5. one Organization per name in JOINT_COMMITTEES.
    """
    committee_d = {}
    non_committees = {
        'City Council', 'Office of the Mayor', 'Office of the City Clerk'
    }
    # Offices that are not wards; such rows cannot become aldermen.
    non_ward_offices = {"Mayor", "Clerk"}

    # member-list column -> (contact detail type, note); the same for
    # every row, so it is built once.
    contact_types = {
        "City Hall Office": ("address", "City Hall Office"),
        "City Hall Phone": ("voice", "City Hall Phone"),
        "Ward Office Phone": ("voice", "Ward Office Phone"),
        "Ward Office Address": ("address", "Ward Office Address"),
        "Fax": ("fax", "Fax")
    }

    for councilman, committees in self.councilMembers():
        ward = councilman['Ward/Office']

        # Skip rows with no office, and rows for the Mayor or Clerk.
        # Those offices have no ward number, so no alderman Person can
        # be built for them; letting them fall through would operate on
        # whatever Person was left over from the previous row.
        if ward == "" or ward in non_ward_offices:
            continue

        p = Person(councilman['Person Name']['label'],
                   district="Ward {}".format(int(ward)),
                   primary_org="legislature",
                   role="Alderman")

        if councilman['Photo']:
            p.image = councilman['Photo']

        for contact_type, (type_, _note) in contact_types.items():
            if councilman[contact_type]:
                p.add_contact_detail(type=type_,
                                     value=councilman[contact_type],
                                     note=_note)

        if councilman["E-mail"]:
            p.add_contact_detail(type="email",
                                 value=councilman['E-mail']['label'],
                                 note='E-mail')

        if councilman['Website']:
            p.add_link(councilman['Website']['url'])

        p.add_source(councilman['Person Name']['url'], note='web')

        for committee, _, _ in committees:
            committee_name = committee['Legislative Body']['label']
            if not committee_name or committee_name in non_committees:
                continue

            o = committee_d.get(committee_name, None)
            if o is None:
                o = Organization(
                    committee_name,
                    classification='committee',
                    parent_id={'name': 'Chicago City Council'})
                o.add_source(committee['Legislative Body']['url'],
                             note='web')
                committee_d[committee_name] = o

            o.add_member(p, role=committee["Title"])

        yield p

    for name, term in FORMER_ALDERMEN.items():
        p = Person(name=name,
                   primary_org="legislature",
                   start_date=term['term'][0],
                   end_date=term['term'][1],
                   district="Ward {}".format(term['ward']),
                   role='Alderman')

        # This one former alderman gets an additional, hard-coded term.
        if name == 'Chandler, Michael D.':
            p.add_term('Alderman',
                       "legislature",
                       district="Ward {}".format(term['ward']),
                       start_date=datetime.date(2011, 5, 16),
                       end_date=datetime.date(2015, 5, 18))

        p.add_source(term['source'], note='web')
        yield p

    for o in committee_d.values():
        yield o

    for committee_name in FORMER_COMMITTEES:
        o = Organization(committee_name,
                         classification='committee',
                         parent_id={'name': 'Chicago City Council'})
        o.add_source("https://chicago.legistar.com/Departments.aspx",
                     note='web')
        yield o

    for joint_committee in JOINT_COMMITTEES:
        o = Organization(joint_committee,
                         classification='committee',
                         parent_id={'name': 'Chicago City Council'})
        o.add_source("https://chicago.legistar.com/Departments.aspx",
                     note='web')
        yield o
def scrape(self):
    """
    Scrape current and former aldermen plus their committees.

    Yields, in order: a Person for each member-list row with a numeric
    ward, a Person for each FORMER_ALDERMEN entry, an Organization for
    each committee seen on a current member, and finally an
    Organization for each name in FORMER_COMMITTEES and
    JOINT_COMMITTEES.
    """
    committee_d = {}
    non_committees = {'City Council',
                      'Office of the Mayor',
                      'Office of the City Clerk'}

    # Loop-invariant lookup: member-list column -> (type, note).
    contact_types = {
        "City Hall Office": ("address", "City Hall Office"),
        "City Hall Phone": ("voice", "City Hall Phone"),
        "Ward Office Phone": ("voice", "Ward Office Phone"),
        "Ward Office Address": ("address", "Ward Office Address"),
        "Fax": ("fax", "Fax")
    }

    for councilman, committees in self.councilMembers():
        if councilman['Ward/Office'] == "":
            continue

        ward = councilman['Ward/Office']
        if ward in {"Mayor", "Clerk"}:
            # The Mayor and Clerk have no ward number, so there is no
            # alderman Person to build for them. Skip the row instead of
            # continuing with the Person left over from the row before.
            continue

        ward = "Ward {}".format(int(ward))
        role = "Alderman"
        p = Person(councilman['Person Name']['label'],
                   district=ward,
                   primary_org="legislature",
                   role=role)

        if councilman['Photo']:
            p.image = councilman['Photo']

        for contact_type, (type_, _note) in contact_types.items():
            if councilman[contact_type]:
                p.add_contact_detail(type=type_,
                                     value=councilman[contact_type],
                                     note=_note)

        if councilman["E-mail"]:
            p.add_contact_detail(type="email",
                                 value=councilman['E-mail']['label'],
                                 note='E-mail')

        if councilman['Website']:
            p.add_link(councilman['Website']['url'])

        p.add_source(councilman['Person Name']['url'], note='web')

        for committee, _, _ in committees:
            committee_name = committee['Legislative Body']['label']
            if committee_name and committee_name not in non_committees:
                o = committee_d.get(committee_name, None)
                if o is None:
                    # First time this committee is seen: create it once
                    # and reuse it for every later member.
                    o = Organization(committee_name,
                                     classification='committee',
                                     parent_id={'name': 'Chicago City Council'})
                    o.add_source(committee['Legislative Body']['url'],
                                 note='web')
                    committee_d[committee_name] = o

                o.add_member(p, role=committee["Title"])

        yield p

    for name, term in FORMER_ALDERMEN.items():
        p = Person(name=name,
                   primary_org="legislature",
                   start_date=term['term'][0],
                   end_date=term['term'][1],
                   district="Ward {}".format(term['ward']),
                   role='Alderman')

        # Hard-coded extra term for this one former alderman.
        if name == 'Chandler, Michael D.':
            p.add_term('Alderman',
                       "legislature",
                       district="Ward {}".format(term['ward']),
                       start_date=datetime.date(2011, 5, 16),
                       end_date=datetime.date(2015, 5, 18))

        p.add_source(term['source'], note='web')
        yield p

    for o in committee_d.values():
        yield o

    for committee_name in FORMER_COMMITTEES:
        o = Organization(committee_name,
                         classification='committee',
                         parent_id={'name': 'Chicago City Council'})
        o.add_source("https://chicago.legistar.com/Departments.aspx",
                     note='web')
        yield o

    for joint_committee in JOINT_COMMITTEES:
        o = Organization(joint_committee,
                         classification='committee',
                         parent_id={'name': 'Chicago City Council'})
        o.add_source("https://chicago.legistar.com/Departments.aspx",
                     note='web')
        yield o
def get_organizations(self):
    """
    Yield the fixed set of organizations and people for this
    jurisdiction.

    Order of output: the city council (with posts for districts 1-9),
    the mayor's office, the standing committee, a list of committees
    sourced from ``self.url``, the city clerk's office, two city clerks,
    and finally the placeholder "All Members" person.
    """
    council = Organization(name="Pittsburgh City Council",
                           classification="legislature")
    district_division = (
        "ocd-division/country:us/state:pa/place:pittsburgh/council_district:{}"
    )
    for number in range(1, 10):
        council.add_post(label="District {}".format(number),
                         role="Councilmember",
                         division_id=district_division.format(number))
    yield council

    city_division = "ocd-division/country:us/state:pa/place:pittsburgh"

    mayor = Organization(name="Mayor", classification="executive")
    mayor.add_post("Mayor", "Mayor", division_id=city_division)
    mayor.add_source("http://pittsburghpa.gov/mayor/index.html", note="web")
    yield mayor

    standing_committee = Organization(name="Standing Committee",
                                      classification="committee")
    standing_committee.add_source(
        "http://pittsburghpa.gov/council/standing-committees", note="web")
    yield standing_committee

    committee_names = (
        # there are a number of committees that no longer exist but have
        # old bills attached to them
        "Committee on Engineering & Construction",
        "Committee on Engineering, Fleet and Forestry",
        "Committee on Facilities, Technology & the Arts",
        "Committee on Finance & Budget",
        "Committee on Finance, Law and Purchasing",
        "Committee on General and Government Services",
        "Committee on General Services & Telecommunications",
        "Committee on General Services, Technology & the Arts",
        "Committee on Housing, Economic Development & Promotion",
        "Committee on Parks, Recreation & Youth Policy",
        "Committee on Planning, Zoning & Land Use",
        "Committee on Public Works & Environmental Services",
        # for whatever reason the clerk's office has also classified
        # the following entries as committees in Legistar
        "Mayor's Agenda - Legislation to be Presented",
        "Post Agenda",
        "Post Agenda Meeting",
        "PUBLIC HEARING SCHEDULE",
        "Executive Session",
    )
    for committee_name in committee_names:
        committee = Organization(name=committee_name,
                                 classification="committee")
        committee.add_source(self.url, note="web")
        yield committee

    city_clerk = Organization(name="City Clerk", classification="department")
    city_clerk.add_post("City Clerk", "City Clerk", division_id=city_division)
    city_clerk.add_source("http://pittsburghpa.gov/clerk/", note="web")
    yield city_clerk

    pree = Person(name="Brenda Pree")
    pree.add_term("City Clerk",
                  "department",
                  start_date=datetime.date(2017, 8, 29),
                  appointment=True)
    pree.add_source("http://pittsburghpa.gov/clerk/clerk-bio")
    yield pree

    doheny = Person(name="Mary Beth Doheny")
    doheny.add_term("City Clerk",
                    "department",
                    start_date=datetime.date(2014, 3, 18),
                    end_date=datetime.date(2017, 8, 28),
                    appointment=True)
    doheny.add_source("http://pittsburghpa.gov")
    yield doheny

    # "All Members", frustratingly, has a Person entry in Pittsburgh
    # Legistar, so the import trips without this. Going strong since 1816!
    all_members = Person(name="All Members")
    all_members.add_term("City Council",
                         "legislature",
                         start_date=datetime.date(1816, 3, 18))
    all_members.add_source("http://pittsburghpa.gov/council/index.html")
    yield all_members