def get_member(self, session, chamber, kpid):
    """Yield a Person for one Kansas legislator, identified by KPID.

    Pulls vitals from the ksapi member endpoint, then tries the public
    bio page for a portrait; a missing bio page is logged and tolerated.
    """
    api_url = "%smembers/%s" % (ksapi.url, kpid)
    member = json.loads(self.get(api_url).text)["content"]

    # Normalize the party label to the canonical form.
    party = member["PARTY"]
    party = "Democratic" if party == "Democrat" else party

    # Each biennium has its own URL slug on kslegislature.org.
    session_slugs = {
        "2013-2014": "b2013_14",
        "2015-2016": "b2015_16",
        "2017-2018": "b2017_18",
        "2019-2020": "b2019_20",
    }
    leg_url = "http://www.kslegislature.org/li/%s/members/%s/" % (
        session_slugs[session], kpid)

    try:
        bio_page = self.lxmlize(leg_url)
        photo_url, = bio_page.xpath('//img[@class="profile-picture"]/@src')
    except scrapelib.HTTPError:
        # No bio page: keep scraping, but drop the link and photo.
        self.warning("{}'s legislator bio page not found".format(
            member["FULLNAME"]))
        leg_url = ""
        photo_url = ""

    person = Person(
        name=member["FULLNAME"],
        district=str(member["DISTRICT"]),
        primary_org=chamber,
        party=party,
        image=photo_url,
    )
    person.extras = {"occupation": member["OCCUPATION"]}

    note = "Capitol Office"
    address = "\n".join([
        "Room {}".format(member["OFFICENUM"]),
        "Kansas State Capitol Building",
        "300 SW 10th St.",
        "Topeka, KS 66612",
    ])
    person.add_contact_detail(type="address", value=address, note=note)
    person.add_contact_detail(type="email", value=member["EMAIL"], note=note)
    if member["OFFPH"]:
        person.add_contact_detail(type="voice", value=member["OFFPH"], note=note)

    person.add_source(api_url)
    person.add_link(leg_url)
    yield person
def get_member(self, session, chamber, kpid):
    """Yield a Person for one Kansas legislator, identified by KPID.

    Fix: the session-slug map was missing '2019-2020', which both sibling
    variants of this scraper include; without it, scraping that session
    raised KeyError. Added for consistency.
    """
    url = '%smembers/%s' % (ksapi.url, kpid)
    content = json.loads(self.get(url).text)['content']

    # Normalize the party label to the canonical form.
    party = content['PARTY']
    if party == 'Democrat':
        party = 'Democratic'

    # Each biennium has its own URL slug on kslegislature.org.
    slug = {
        '2013-2014': 'b2013_14',
        '2015-2016': 'b2015_16',
        '2017-2018': 'b2017_18',
        '2019-2020': 'b2019_20',
    }[session]
    leg_url = 'http://www.kslegislature.org/li/%s/members/%s/' % (slug, kpid)

    try:
        legislator_page = self.lxmlize(leg_url)
        photo_url, = legislator_page.xpath(
            '//img[@class="profile-picture"]/@src')
    except scrapelib.HTTPError:
        # Missing bio page: log it and continue without link/photo.
        self.warning("{}'s legislator bio page not found".format(
            content['FULLNAME']))
        leg_url = ''
        photo_url = ''

    person = Person(
        name=content['FULLNAME'],
        district=str(content['DISTRICT']),
        primary_org=chamber,
        party=party,
        image=photo_url,
    )
    person.extras = {'occupation': content['OCCUPATION']}

    address = '\n'.join([
        'Room {}'.format(content['OFFICENUM']),
        'Kansas State Capitol Building',
        '300 SW 10th St.',
        'Topeka, KS 66612',
    ])
    note = 'Capitol Office'
    person.add_contact_detail(type='address', value=address, note=note)
    person.add_contact_detail(type='email', value=content['EMAIL'], note=note)
    if content['OFFPH']:
        person.add_contact_detail(type='voice', value=content['OFFPH'],
                                  note=note)

    person.add_source(url)
    person.add_link(leg_url)
    yield person
def get_member(self, session, chamber, kpid):
    """Yield a Person for one Kansas legislator, identified by KPID.

    Vitals come from the ksapi member endpoint; the public bio page is
    probed for a portrait and tolerated if missing.
    """
    url = '%smembers/%s' % (ksapi.url, kpid)
    content = json.loads(self.get(url).text)['content']

    # Normalize the party label to the canonical form.
    party = content['PARTY']
    if party == 'Democrat':
        party = 'Democratic'

    # Each biennium has its own URL slug on kslegislature.org.
    slug = {'2013-2014': 'b2013_14',
            '2015-2016': 'b2015_16',
            '2017-2018': 'b2017_18',
            '2019-2020': 'b2019_20', }[session]
    leg_url = 'http://www.kslegislature.org/li/%s/members/%s/' % (slug, kpid)

    try:
        legislator_page = self.lxmlize(leg_url)
        # Exactly one portrait is expected; tuple-unpack enforces that.
        photo_url, = legislator_page.xpath(
            '//img[@class="profile-picture"]/@src')
    except scrapelib.HTTPError:
        # Missing bio page: log and continue with empty link/photo.
        self.warning("{}'s legislator bio page not found".format(content['FULLNAME']))
        leg_url = ''
        photo_url = ''

    person = Person(
        name=content['FULLNAME'],
        district=str(content['DISTRICT']),
        primary_org=chamber,
        party=party,
        image=photo_url,
    )
    person.extras = {'occupation': content['OCCUPATION']}

    # Capitol mailing address is assembled from the room number only;
    # the rest is the fixed statehouse address.
    address = '\n'.join([
        'Room {}'.format(content['OFFICENUM']),
        'Kansas State Capitol Building',
        '300 SW 10th St.',
        'Topeka, KS 66612',
    ])
    note = 'Capitol Office'
    person.add_contact_detail(type='address', value=address, note=note)
    person.add_contact_detail(type='email', value=content['EMAIL'], note=note)
    if content['OFFPH']:
        person.add_contact_detail(type='voice', value=content['OFFPH'], note=note)

    person.add_source(url)
    person.add_link(leg_url)
    yield person
def _scrape_lower_chamber(self):
    """Yield Person records for House members from the roster grid.

    Vacant seats are saved via _save_vacant_legislator instead of being
    yielded; all other members get a details-page lookup for email/photo.
    """
    self.info('Scraping lower chamber for legislators.')
    chamber = 'lower'
    roster_url = (self._reps_url)
    page = self.get(roster_url).text
    page = lxml.html.fromstring(page)
    # This is the ASP.net table container
    table_xpath = ('id("ContentPlaceHolder1_'
                   'gridMembers_DXMainTable")')
    table = page.xpath(table_xpath)[0]
    for tr in table.xpath('tr')[1:]:
        # If a given term hasn't occurred yet, then ignore it
        # Eg, in 2017, the 2018 term page will have a blank table
        if tr.attrib.get('class') == 'dxgvEmptyDataRow':
            self.warning('No House members found')
            return
        # Column layout: last, first, district, party, phone, room.
        tds = tr.xpath('td')
        last_name = tds[0].text_content().strip()
        first_name = tds[1].text_content().strip()
        full_name = '{} {}'.format(first_name, last_name)
        district = str(int(tds[2].text_content().strip()))
        party = tds[3].text_content().strip()
        if party == 'Democrat':
            party = 'Democratic'
        if party.strip() == "":
            # Workaround for now.
            party = "Other"
        phone = tds[4].text_content().strip()
        room = tds[5].text_content().strip()
        address = self._assumed_address_fmt.format(room if room else '')
        if last_name == 'Vacant':
            person = Person(
                name=full_name,
                primary_org=chamber,
                district=district,
                party=party,
            )
            person.extras = {
                'first_name': first_name,
                'last_name': last_name,
            }
            person.add_contact_detail(type='address', value=address,
                                      note='Capitol Office')
            if phone.strip():
                person.add_contact_detail(type='voice', value=phone,
                                          note='Capitol Office')
            person.add_source(roster_url)
            # Vacant seats are recorded separately, not yielded.
            self._save_vacant_legislator(person)
        else:
            # NOTE(review): keys start with a space because full_name is
            # '{first} {last}' with an empty first name for these members.
            # Also, party was already forced to "Other" above when blank,
            # so the `party == ""` test below appears unreachable — confirm
            # whether the override is still needed.
            party_override = {" Green": "Democratic",
                              " Sisco": "Republican"}
            if party == "" and full_name in party_override:
                party = party_override[full_name]
            details_url = self._rep_details_url.format(district)
            details_page = lxml.html.fromstring(self.get(details_url).text)
            person = Person(
                name=full_name,
                primary_org=chamber,
                district=district,
                party=party,
            )
            person.extras = {
                'first_name': first_name,
                'last_name': last_name,
            }
            person.add_source(roster_url)
            person.add_source(details_url)
            person.add_link(details_url)
            # The email cell holds a mailto: link; skip a bare "mailto:".
            email = details_page.xpath(
                '//*[@id="ContentPlaceHolder1_lblAddresses"]'
                '/table/tr[4]/td/a/@href')
            if len(email) > 0 and email[0].lower() != 'mailto:':
                email = email[0].split(':')[1]
            else:
                email = None
            person.add_contact_detail(type='address', value=address,
                                      note='Capitol Office')
            if phone:
                person.add_contact_detail(type='voice', value=phone,
                                          note='Capitol Office')
            if email:
                person.add_contact_detail(type='email', value=email,
                                          note='Capitol Office')
            picture = details_page.xpath(
                '//*[@id="ContentPlaceHolder1_imgPhoto"]/@src')
            if len(picture) > 0:
                person.image = picture[0]
            yield person
def _parse_person(self, row, chamber, seat_map):
    """Build a Person from one NH roster row.

    Fix: removed a stray debug `print(district)` that leaked to stdout on
    every row.

    lower-chamber districts are formatted as "<County> <number>";
    upper-chamber districts are the bare number.
    """
    # Capture legislator vitals.
    first_name = row['FirstName']
    middle_name = row['MiddleName']
    last_name = row['LastName']
    full_name = '{} {} {}'.format(first_name, middle_name, last_name)
    # Collapse the doubled space left behind when middle_name is empty.
    full_name = re.sub(r'[\s]{2,}', ' ', full_name)

    if chamber == 'lower':
        district = '{} {}'.format(row['County'], int(row['District'])).strip()
    else:
        district = str(int(row['District'])).strip()

    party = self.party_map[row['party'].upper()]
    email = row['WorkEmail']

    person = Person(primary_org=chamber,
                    district=district,
                    name=full_name,
                    party=party)
    person.extras = {
        'first_name': first_name,
        'middle_name': middle_name,
        'last_name': last_name
    }
    if email:
        person.add_contact_detail(type='email', value=email,
                                  note='District Office')

    # Capture legislator office contact information.
    district_address = '{}\n{}\n{}, {} {}'.format(row['Address'],
                                                  row['address2'],
                                                  row['city'], row['State'],
                                                  row['Zipcode']).strip()
    phone = row['Phone'].strip()
    if not phone:
        phone = None
    if district_address:
        person.add_contact_detail(type='address', value=district_address,
                                  note='Home Office')
    if phone:
        person.add_contact_detail(type='voice', value=phone,
                                  note='Home Office')

    # Retrieve legislator portrait.
    profile_url = None
    if chamber == 'upper':
        profile_url = self.senate_profile_url.format(row['District'])
    elif chamber == 'lower':
        try:
            seat_number = seat_map[row['seatno']]
            profile_url = self.house_profile_url.format(seat_number)
        except KeyError:
            # Unknown seat number: no photo for this member.
            pass
    if profile_url:
        person.image = self._get_photo(profile_url, chamber)
        person.add_source(profile_url)
    return person
def scrape(self):
    """Yield NYC council members (Person) and committees (Organization).

    Fix: the contact-detail loop called the dict — `web(contact_type)` —
    which raises TypeError the moment a contact value is truthy; it now
    subscripts `web[contact_type]` as intended.
    """
    web_scraper = LegistarPersonScraper(
        requests_per_minute=self.requests_per_minute)
    web_scraper.MEMBERLIST = 'http://legistar.council.nyc.gov/DepartmentDetail.aspx?ID=6897&GUID=CDC6E691-8A8C-4F25-97CB-86F31EDAB081&Mode=MainBody'
    if self.cache_storage:
        web_scraper.cache_storage = self.cache_storage
    if self.requests_per_minute == 0:
        web_scraper.cache_write_only = False

    # Scrape the web roster, keyed by (stripped) member name.
    web_info = {}
    for member, _ in web_scraper.councilMembers():
        name = member['Person Name']['label'].strip()
        web_info[name] = member

    city_council, = [body for body in self.bodies()
                     if body['BodyName'] == 'City Council']
    terms = collections.defaultdict(list)

    public_advocates = {
        # Match casing to Bill De Blasio as council member
        'The Public Advocate (Mr. de Blasio)': 'Bill De Blasio',
        'The Public Advocate (Ms. James)': 'Letitia James',
    }

    for office in self.body_offices(city_council):
        name = office['OfficeRecordFullName']
        name = public_advocates.get(name, name).strip()
        terms[name].append(office)
        # Add past members (and public advocates) missing from the web
        # roster; the defaultdict yields None for every field lookup.
        if name not in web_info:
            web_info[name] = collections.defaultdict(lambda: None)

    # Check that we have everyone we expect, formatted consistently, in
    # both information arrays. For instance, this will fail if we forget to
    # strip trailing spaces from names on one side or the other (which has
    # the effect of omitting information, such as post, from the scrape).
    assert set(web_info.keys()) == set(terms.keys())

    members = {}
    for member, offices in terms.items():
        p = Person(member)
        web = web_info[member]

        for term in offices:
            role = term['OfficeRecordTitle']
            if role == 'Public Advocate':
                role = 'Non-Voting Council Member'
            else:
                role = 'Council Member'
            # Drop the zero-padding Legistar puts in district numbers.
            district = web.get('District', '').replace(' 0', ' ')
            p.add_term(role, 'legislature', district=district,
                       start_date=self.toDate(term['OfficeRecordStartDate']),
                       end_date=self.toDate(term['OfficeRecordEndDate']))

        party = web.get('Political Party')
        if party == 'Democrat':
            party = 'Democratic'
        if party:
            p.add_party(party)

        if web.get('Photo'):
            p.image = web['Photo']

        contact_types = {
            "City Hall Office": ("address", "City Hall Office"),
            "City Hall Phone": ("voice", "City Hall Phone"),
            "Ward Office Phone": ("voice", "Ward Office Phone"),
            "Ward Office Address": ("address", "Ward Office Address"),
            "Fax": ("fax", "Fax")
        }
        for contact_type, (type_, _note) in contact_types.items():
            # BUG FIX: was `web(contact_type)` — calling the dict.
            if web.get(contact_type) and web[contact_type] != 'N/A':
                p.add_contact_detail(type=type_,
                                     value=web[contact_type],
                                     note=_note)

        if web.get('E-mail'):
            p.add_contact_detail(type="email",
                                 value=web['E-mail']['url'],
                                 note='E-mail')
        if web.get('Web site'):
            p.add_link(web['Web site']['url'], note='web site')
        if web.get('Notes'):
            p.extras = {'Notes': web['Notes']}

        if not p.sources:  # Only add sources once
            source_urls = self.person_sources_from_office(term)
            person_api_url, person_web_url = source_urls
            p.add_source(person_api_url, note='api')
            p.add_source(person_web_url, note='web')

        members[member] = p

    committee_types = ['Committee',
                       'Inactive Committee',
                       'Select Committee',
                       'Subcommittee',
                       'Task Force',
                       'Land Use',  # Committee on Land Use
                       ]
    body_types = {k: v for k, v in self.body_types().items()
                  if k in committee_types}

    for body in self.bodies():
        if body['BodyTypeName'] in body_types \
                or body['BodyName'] in ('Legislative Documents Unit',
                                        'Legal and Government Affairs Division'):
            # Skip typo in API data
            if body['BodyName'] == 'Committee on Mental Health, Developmental Disability, Alcoholism, Substance Abuse amd Disability Services':
                continue

            parent_org = PARENT_ORGS.get(body['BodyName'],
                                         'New York City Council')
            body_name = body['BodyName']
            o = Organization(body_name,
                             classification='committee',
                             parent_id={'name': parent_org})
            o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body),
                         note='api')
            o.add_source(self.WEB_URL + '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'.format(**body),
                         note='web')

            for office in self.body_offices(body):
                # Possible roles: 'Council Member', 'MEMBER', 'Ex-Officio',
                # 'Committee Member', None, 'CHAIRPERSON'
                role = office['OfficeRecordTitle']
                if role and role.lower() == 'chairperson':
                    role = 'Chairperson'
                else:
                    role = 'Member'
                person = office['OfficeRecordFullName']
                person = public_advocates.get(person, person).strip()
                if person in members:
                    p = members[person]
                else:
                    p = Person(person)
                    source_urls = self.person_sources_from_office(office)
                    person_api_url, person_web_url = source_urls
                    p.add_source(person_api_url, note='api')
                    p.add_source(person_web_url, note='web')
                    members[person] = p
                p.add_membership(o, role=role,
                                 start_date=self.toDate(office['OfficeRecordStartDate']),
                                 end_date=self.toDate(office['OfficeRecordEndDate']))

            yield o

    for p in members.values():
        yield p
def scrape(self):
    """Yield NYC council members (Person) and committees (Organization).

    Fix: the contact-detail check called the dict — `web(contact_type)` —
    raising TypeError whenever a contact value was present; it now
    subscripts `web[contact_type]` as intended.
    """
    web_scraper = LegistarPersonScraper(
        requests_per_minute=self.requests_per_minute)
    web_scraper.MEMBERLIST = 'http://legistar.council.nyc.gov/DepartmentDetail.aspx?ID=6897&GUID=CDC6E691-8A8C-4F25-97CB-86F31EDAB081&Mode=MainBody'
    if self.cache_storage:
        web_scraper.cache_storage = self.cache_storage
    if self.requests_per_minute == 0:
        web_scraper.cache_write_only = False

    # Web roster, keyed by stripped member name.
    web_info = {}
    for member, _ in web_scraper.councilMembers():
        name = member['Person Name']['label'].strip()
        web_info[name] = member

    city_council, = [
        body for body in self.bodies() if body['BodyName'] == 'City Council'
    ]
    terms = collections.defaultdict(list)

    public_advocates = {
        # Match casing to Bill De Blasio as council member
        'The Public Advocate (Mr. de Blasio)': 'Bill De Blasio',
        'The Public Advocate (Ms. James)': 'Letitia James',
    }

    for office in self.body_offices(city_council):
        name = office['OfficeRecordFullName']
        name = public_advocates.get(name, name).strip()
        terms[name].append(office)
        # Past members and public advocates may be absent from the web
        # roster; a defaultdict stands in, returning None per field.
        if name not in web_info:
            web_info[name] = collections.defaultdict(lambda: None)

    # Check that we have everyone we expect, formatted consistently, in
    # both information arrays. For instance, this will fail if we forget to
    # strip trailing spaces from names on one side or the other (which has
    # the effect of omitting information, such as post, from the scrape).
    assert set(web_info.keys()) == set(terms.keys())

    members = {}
    for member, offices in terms.items():
        p = Person(member)
        web = web_info[member]

        for term in offices:
            role = term['OfficeRecordTitle']
            if role == 'Public Advocate':
                role = 'Non-Voting Council Member'
            else:
                role = 'Council Member'
            # Strip Legistar's zero-padded district numbers.
            district = web.get('District', '').replace(' 0', ' ')
            p.add_term(role,
                       'legislature',
                       district=district,
                       start_date=self.toDate(
                           term['OfficeRecordStartDate']),
                       end_date=self.toDate(term['OfficeRecordEndDate']))

        party = web.get('Political Party')
        if party == 'Democrat':
            party = 'Democratic'
        if party:
            p.add_party(party)

        if web.get('Photo'):
            p.image = web['Photo']

        contact_types = {
            "City Hall Office": ("address", "City Hall Office"),
            "City Hall Phone": ("voice", "City Hall Phone"),
            "Ward Office Phone": ("voice", "Ward Office Phone"),
            "Ward Office Address": ("address", "Ward Office Address"),
            "Fax": ("fax", "Fax")
        }
        for contact_type, (type_, _note) in contact_types.items():
            # BUG FIX: was `web(contact_type)` — calling the dict.
            if web.get(contact_type) and web[contact_type] != 'N/A':
                p.add_contact_detail(type=type_,
                                     value=web[contact_type],
                                     note=_note)

        if web.get('E-mail'):
            p.add_contact_detail(type="email",
                                 value=web['E-mail']['url'],
                                 note='E-mail')
        if web.get('Web site'):
            p.add_link(web['Web site']['url'], note='web site')
        if web.get('Notes'):
            p.extras = {'Notes': web['Notes']}

        if not p.sources:  # Only add sources once
            source_urls = self.person_sources_from_office(term)
            person_api_url, person_web_url = source_urls
            p.add_source(person_api_url, note='api')
            p.add_source(person_web_url, note='web')

        members[member] = p

    committee_types = [
        'Committee', 'Inactive Committee', 'Select Committee',
        'Subcommittee', 'Task Force', 'Land Use'
    ]  # Committee on Land Use
    body_types = {
        k: v
        for k, v in self.body_types().items() if k in committee_types
    }

    for body in self.bodies():
        if body['BodyTypeName'] in body_types \
                or body['BodyName'] in ('Legislative Documents Unit',
                                        'Legal and Government Affairs Division'):
            # Skip typo in API data
            if body['BodyName'] == 'Committee on Mental Health, Developmental Disability, Alcoholism, Substance Abuse amd Disability Services':
                continue

            parent_org = PARENT_ORGS.get(body['BodyName'],
                                         'New York City Council')
            body_name = body['BodyName']
            o = Organization(body_name,
                             classification='committee',
                             parent_id={'name': parent_org})
            o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body),
                         note='api')
            o.add_source(
                self.WEB_URL +
                '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'.
                format(**body),
                note='web')

            for office in self.body_offices(body):
                # Possible roles: 'Council Member', 'MEMBER', 'Ex-Officio',
                # 'Committee Member', None, 'CHAIRPERSON'
                role = office['OfficeRecordTitle']
                if role and role.lower() == 'chairperson':
                    role = 'Chairperson'
                else:
                    role = 'Member'
                person = office['OfficeRecordFullName']
                person = public_advocates.get(person, person).strip()
                if person in members:
                    p = members[person]
                else:
                    p = Person(person)
                    source_urls = self.person_sources_from_office(office)
                    person_api_url, person_web_url = source_urls
                    p.add_source(person_api_url, note='api')
                    p.add_source(person_web_url, note='web')
                    members[person] = p
                p.add_membership(o,
                                 role=role,
                                 start_date=self.toDate(
                                     office['OfficeRecordStartDate']),
                                 end_date=self.toDate(
                                     office['OfficeRecordEndDate']))

            yield o

    for p in members.values():
        yield p
def scrape(self):
    """Yield council members and committee Organizations.

    Fix: removed a stray debug `print(start_date, end_date)` that leaked
    to stdout for every term span.

    Consecutive term entries for the same district are merged when one
    starts the day after the previous one ends.
    """
    noncommittees = {'Committee of the Whole'}
    committee_d = {}
    people_d = {}

    # Group roster entries by the member's profile URL; keep the latest
    # entry's committees.
    for councilman, committees in self.councilMembers():
        if 'url' in councilman['Person Name']:
            councilman_url = councilman['Person Name']['url']
            if councilman_url in people_d:
                people_d[councilman_url][0].append(councilman)
            else:
                people_d[councilman_url] = [councilman], committees

    for person_entries, committees in people_d.values():
        councilman = person_entries[-1]
        p = Person(councilman['Person Name']['label'])

        # Disambiguate from the other Letitia James record.
        if p.name == 'Letitia James':
            p.name = 'Letitia Ms. James'
            p.add_name('Letitia James')

        spans = [(self.toTime(entry['Start Date']).date(),
                  self.toTime(entry['End Date']).date(),
                  entry['District'])
                 for entry in person_entries]

        # Merge adjacent spans (end + 1 day == next start, same district).
        merged_spans = []
        last_end_date = None
        last_district = None
        for start_date, end_date, district in sorted(spans):
            if last_end_date is None:
                span = [start_date, end_date, district]
            elif (start_date - last_end_date) == datetime.timedelta(1) \
                    and district == last_district:
                span[1] = end_date
            else:
                merged_spans.append(span)
                span = [start_date, end_date, district]
            last_end_date = end_date
            last_district = district
        merged_spans.append(span)

        for start_date, end_date, district in merged_spans:
            district = councilman['District'].replace(' 0', ' ')
            # An end date at the edge of the scraped window means the term
            # is still open.
            if end_date == datetime.date(2017, 12, 31):
                end_date = ''
            else:
                end_date = end_date.isoformat()
            p.add_term('Council Member', 'legislature',
                       district=district,
                       start_date=start_date.isoformat(),
                       end_date=end_date)

        party = councilman['Political Party']
        if party == 'Democrat':
            party = 'Democratic'
        if party:
            p.add_party(party)

        if councilman['Photo']:
            p.image = councilman['Photo']

        if councilman["E-mail"]:
            p.add_contact_detail(type="email",
                                 value=councilman['E-mail']['url'],
                                 note='E-mail')
        if councilman['Web site']:
            p.add_link(councilman['Web site']['url'], note='web site')

        p.extras = {'Notes': councilman['Notes']}
        p.add_source(councilman['Person Name']['url'], note='web')

        for committee, _, _ in committees:
            committee_name = committee['Department Name']['label']
            if committee_name not in noncommittees \
                    and 'committee' in committee_name.lower():
                o = committee_d.get(committee_name, None)
                if o is None:
                    parent_id = PARENT_ORGS.get(committee_name,
                                                'New York City Council')
                    o = Organization(committee_name,
                                     classification='committee',
                                     parent_id={'name': parent_id})
                    o.add_source(committee['Department Name']['url'])
                    committee_d[committee_name] = o
                membership = o.add_member(p, role=committee["Title"])
                membership.start_date = self.mdY2Ymd(committee["Start Date"])
        yield p

    # Yield top-level committees before subcommittees so parents exist
    # when children are imported.
    for o in committee_d.values():
        if 'Committee' in o.name:
            yield o
    for o in committee_d.values():
        if 'Subcommittee' in o.name:
            yield o

    o = Organization('Committee on Mental Health, Developmental Disability, Alcoholism, Drug Abuse and Disability Services',
                     classification='committee',
                     parent_id={'name': 'New York City Council'})
    o.add_source("http://legistar.council.nyc.gov/Departments.aspx")
    yield o

    o = Organization('Subcommittee on Drug Abuse',
                     classification='committee',
                     parent_id={'name': 'Committee on Mental Health, Developmental Disability, Alcoholism, Drug Abuse and Disability Services'})
    o.add_source("http://legistar.council.nyc.gov/Departments.aspx")
    yield o
def scrape_session(self, session, chambers):
    """Yield Person records for all Georgia legislators in a session.

    Fixes:
    - `next()` on an exhausted generator raises StopIteration, never
      IndexError, so the guard around the service lookup could not fire;
      now catches StopIteration.
    - The district-office guard tested the *capitol* address/contact
      variables (copy-paste from the capitol block); it now tests the
      district ones.
    """
    sid = SESSION_SITE_IDS[session]
    members = backoff(
        self.sservice.GetMembersBySession,
        sid
    )['MemberListing']

    for member in members:
        guid = member['Id']
        member_info = backoff(self.sservice.GetMember, guid)

        # Check to see if the member has vacated; skip if so:
        try:
            legislative_service = next(
                service
                for service in member_info['SessionsInService']['LegislativeService']
                if service['Session']['Id'] == sid
            )
        except StopIteration:
            # next() signals an empty match with StopIteration.
            raise Exception("Something very bad is going on with the "
                            "Legislative service")
        if legislative_service['DateVacated']:
            continue

        nick_name, first_name, middle_name, last_name = (
            member_info['Name'][x] for x in [
                'Nickname', 'First', 'Middle', 'Last'
            ]
        )
        # Prefer the nickname as the displayed first name when present.
        first_name = nick_name if nick_name else first_name
        if middle_name:
            full_name = "%s %s %s" % (first_name, middle_name, last_name)
        else:
            full_name = "%s %s" % (first_name, last_name)

        party = legislative_service['Party']
        if party == 'Democrat':
            party = 'Democratic'
        elif party.strip() == '':
            party = 'other'

        chamber, district = (
            legislative_service['District'][x] for x in [
                'Type', 'Number'
            ]
        )
        chamber = {
            "House": 'lower',
            "Senate": 'upper'
        }[chamber]

        url, photo = self.scrape_homepage(HOMEPAGE_URLS[chamber],
                                          {"code": guid, "sid": sid})

        legislator = Person(
            name=full_name,
            district=str(district),
            party=party,
            primary_org=chamber,
            image=photo,
        )
        legislator.extras = {
            'last_name': last_name,
            'first_name': first_name,
            'guid': guid,
        }

        capitol_address = self.clean_list([
            member_info['Address'][x] for x in [
                'Street', 'City', 'State', 'Zip'
            ]
        ])
        capitol_address = " ".join(
            addr_component for addr_component in capitol_address
            if addr_component
        ).strip()
        # Contact info is [email, phone, fax].
        capitol_contact_info = self.clean_list([
            member_info['Address'][x] for x in [
                'Email', 'Phone', 'Fax'
            ]
        ])

        # Sometimes email is set to a long cryptic string.
        # If it doesn't have a @ character, simply set it to None
        # examples:
        # 01X5dvct3G1lV6RQ7I9o926Q==&c=xT8jBs5X4S7ZX2TOajTx2W7CBprTaVlpcvUvHEv78GI=
        # 01X5dvct3G1lV6RQ7I9o926Q==&c=eSH9vpfdy3XJ989Gpw4MOdUa3n55NTA8ev58RPJuzA8=
        if capitol_contact_info[0] and '@' not in capitol_contact_info[0]:
            capitol_contact_info[0] = None

        # if we have more than 2 chars (eg state)
        # or a phone/fax/email address record the info
        if len(capitol_address) > 2 or not capitol_contact_info.count(None) == 3:
            if capitol_contact_info[0] and '*****@*****.**' in capitol_contact_info[0]:
                self.warning("XXX: GA SITE WAS HACKED.")
                capitol_contact_info[1] = None

            if capitol_address.strip():
                legislator.add_contact_detail(
                    type='address', value=capitol_address,
                    note='Capitol Address')
            if capitol_contact_info[1]:
                legislator.add_contact_detail(
                    type='voice', value=capitol_contact_info[1],
                    note='Capitol Address')
            if capitol_contact_info[2]:
                legislator.add_contact_detail(
                    type='fax', value=capitol_contact_info[2],
                    note='Capitol Address')
            if capitol_contact_info[0]:
                legislator.add_contact_detail(
                    type='email', value=capitol_contact_info[0],
                    note='Capitol Address')

        district_address = self.clean_list([
            member_info['DistrictAddress'][x] for x in [
                'Street', 'City', 'State', 'Zip'
            ]
        ])
        district_contact_info = self.clean_list([
            member_info['DistrictAddress'][x] for x in [
                'Email', 'Phone', 'Fax'
            ]
        ])

        # Same issue with district email. See above comment
        if district_contact_info[0] and '@' not in district_contact_info[0]:
            district_contact_info[0] = None

        district_address = " ".join(
            addr_component for addr_component in district_address
            if addr_component
        ).strip()

        # BUG FIX: this guard previously re-tested the capitol variables.
        if len(district_address) > 2 or not district_contact_info.count(None) == 3:
            # NOTE(review): the capitol block checks index 0 (email) for
            # the hack marker; this checks index 1 (phone) — confirm
            # which is intended.
            if (district_contact_info[1] and
                    '*****@*****.**' in district_contact_info[1]):
                self.warning("XXX: GA SITE WAS HACKED.")
                district_contact_info[1] = None

            if district_address.strip():
                legislator.add_contact_detail(
                    type='address', value=district_address,
                    note='District Address')
            if district_contact_info[1]:
                legislator.add_contact_detail(
                    type='voice', value=district_contact_info[1],
                    note='District Address')
            if district_contact_info[2]:
                legislator.add_contact_detail(
                    type='fax', value=district_contact_info[2],
                    note='District Address')
            if district_contact_info[0]:
                legislator.add_contact_detail(
                    type='email', value=district_contact_info[0],
                    note='District Address')

        legislator.add_link(url)
        legislator.add_source(self.ssource)
        legislator.add_source(HOMEPAGE_URLS[chamber].format(
            **{"code": guid, "sid": sid}))
        yield legislator
def _parse_person(self, row, chamber, seat_map):
    """Translate one NH roster row into a Person record.

    Returns None for seats whose district is listed as 0 (unassigned).
    """
    first = row['FirstName']
    middle = row['MiddleName']
    last = row['LastName']
    # Join the three parts, then collapse the doubled space left when the
    # middle name is blank.
    name = re.sub(r'[\s]{2,}', ' ', '{} {} {}'.format(first, middle, last))

    # House districts are "<County> <number>"; Senate districts are bare
    # numbers.
    if chamber == 'lower':
        district = '{} {}'.format(row['County'], int(row['District'])).strip()
    else:
        district = str(int(row['District'])).strip()

    if district == '0':
        self.warning('Skipping {}, district is set to 0'.format(name))
        return

    # Temporary fix for Kari Lerner
    if district == 'Rockingham 0' and last == 'Lerner':
        district = 'Rockingham 4'

    person = Person(primary_org=chamber, district=district,
                    name=name, party=self.party_map[row['party'].upper()])
    person.extras = {
        'first_name': first,
        'middle_name': middle,
        'last_name': last
    }

    work_email = row['WorkEmail']
    if work_email:
        person.add_contact_detail(type='email', value=work_email,
                                  note='District Office')

    # Home-office contact details.
    home_address = '{}\n{}\n{}, {} {}'.format(row['Address'], row['address2'],
                                              row['city'], row['State'],
                                              row['Zipcode']).strip()
    if home_address:
        person.add_contact_detail(type='address', value=home_address,
                                  note='Home Office')
    home_phone = row['Phone'].strip()
    if home_phone:
        person.add_contact_detail(type='voice', value=home_phone,
                                  note='Home Office')

    # Portrait: senators by district, representatives by seat number.
    portrait_url = None
    if chamber == 'upper':
        portrait_url = self.senate_profile_url.format(row['District'])
    elif chamber == 'lower' and row['seatno'] in seat_map:
        portrait_url = self.house_profile_url.format(seat_map[row['seatno']])
    if portrait_url:
        person.image = self._get_photo(portrait_url, chamber)
        person.add_source(portrait_url)
    return person
def _parse_person(self, row, chamber, seat_map):
    """Translate one NH roster row into a Person record.

    Returns None (skips the row) when the district is listed as 0.
    Contact notes are tagged "Capitol Office" or "District Office" based
    on heuristics over the email domain, chamber, and phone exchange.
    """
    # Capture legislator vitals.
    first_name = row["FirstName"]
    middle_name = row["MiddleName"]
    last_name = row["LastName"]
    full_name = "{} {} {}".format(first_name, middle_name, last_name)
    # Collapse the doubled space left when the middle name is blank.
    full_name = re.sub(r"[\s]{2,}", " ", full_name)

    # House districts are "<County> <number>"; Senate are bare numbers.
    if chamber == "lower":
        district = "{} {}".format(row["County"], int(row["District"])).strip()
    else:
        district = str(int(row["District"])).strip()

    party = self.party_map[row["party"].upper()]
    email = row["WorkEmail"]

    if district == "0":
        self.warning("Skipping {}, district is set to 0".format(full_name))
        return

    person = Person(primary_org=chamber, district=district,
                    name=full_name, party=party)
    extras = {
        "first_name": first_name,
        "middle_name": middle_name,
        "last_name": last_name,
    }
    person.extras = extras

    if email:
        # A state-legislature address is treated as the capitol office.
        office = "Capitol" if email.endswith(
            "@leg.state.nh.us") else "District"
        person.add_contact_detail(type="email", value=email,
                                  note=office + " Office")

    # Capture legislator office contact information.
    district_address = "{}\n{}\n{}, {} {}".format(row["Address"],
                                                  row["address2"],
                                                  row["city"], row["State"],
                                                  row["Zipcode"]).strip()
    phone = row["Phone"].strip()
    if not phone:
        phone = None

    if district_address:
        office = "Capitol" if chamber == "upper" else "District"
        person.add_contact_detail(type="address", value=district_address,
                                  note=office + " Office")
    if phone:
        # NOTE(review): "271-" is presumably the Concord capitol phone
        # exchange — confirm.
        office = "Capitol" if "271-" in phone else "District"
        person.add_contact_detail(type="voice", value=phone,
                                  note=office + " Office")

    # Retrieve legislator portrait: senators by district number,
    # representatives by mapped seat number.
    profile_url = None
    if chamber == "upper":
        profile_url = self.senate_profile_url.format(row["District"])
    elif chamber == "lower":
        try:
            seat_number = seat_map[row["seatno"]]
            profile_url = self.house_profile_url.format(seat_number)
        except KeyError:
            # Unknown seat number: no photo for this member.
            pass
    if profile_url:
        person.image = self._get_photo(profile_url, chamber)
        person.add_source(profile_url)
    return person
def scrape_session(self, session, chambers):
    """Yield Person records for all Georgia legislators in a session.

    Fixes (same as the sibling variant of this method):
    - catch StopIteration from next(), not IndexError, so the failure
      guard around the service lookup can actually fire;
    - the district-office guard tested the capitol address/contact
      variables (copy-paste); it now tests the district ones.
    """
    sid = SESSION_SITE_IDS[session]
    members = backoff(self.sservice.GetMembersBySession,
                      sid)['MemberListing']

    for member in members:
        guid = member['Id']
        member_info = backoff(self.sservice.GetMember, guid)

        # Check to see if the member has vacated; skip if so:
        try:
            legislative_service = next(
                service for service in member_info['SessionsInService']
                ['LegislativeService'] if service['Session']['Id'] == sid)
        except StopIteration:
            # next() signals "no match" with StopIteration.
            raise Exception("Something very bad is going on with the "
                            "Legislative service")
        if legislative_service['DateVacated']:
            continue

        nick_name, first_name, middle_name, last_name = (
            member_info['Name'][x]
            for x in ['Nickname', 'First', 'Middle', 'Last'])
        # Prefer the nickname as the displayed first name when present.
        first_name = nick_name if nick_name else first_name
        if middle_name:
            full_name = "%s %s %s" % (first_name, middle_name, last_name)
        else:
            full_name = "%s %s" % (first_name, last_name)

        party = legislative_service['Party']
        if party == 'Democrat':
            party = 'Democratic'
        elif party.strip() == '':
            party = 'other'

        chamber, district = (legislative_service['District'][x]
                             for x in ['Type', 'Number'])
        chamber = {"House": 'lower', "Senate": 'upper'}[chamber]

        url, photo = self.scrape_homepage(HOMEPAGE_URLS[chamber], {
            "code": guid,
            "sid": sid
        })

        legislator = Person(
            name=full_name,
            district=str(district),
            party=party,
            primary_org=chamber,
            image=photo,
        )
        legislator.extras = {
            'last_name': last_name,
            'first_name': first_name,
            'guid': guid,
        }

        capitol_address = self.clean_list([
            member_info['Address'][x]
            for x in ['Street', 'City', 'State', 'Zip']
        ])
        capitol_address = " ".join(addr_component
                                   for addr_component in capitol_address
                                   if addr_component).strip()
        # Contact info is [email, phone, fax].
        capitol_contact_info = self.clean_list(
            [member_info['Address'][x] for x in ['Email', 'Phone', 'Fax']])

        # Sometimes email is set to a long cryptic string.
        # If it doesn't have a @ character, simply set it to None
        # examples:
        # 01X5dvct3G1lV6RQ7I9o926Q==&c=xT8jBs5X4S7ZX2TOajTx2W7CBprTaVlpcvUvHEv78GI=
        # 01X5dvct3G1lV6RQ7I9o926Q==&c=eSH9vpfdy3XJ989Gpw4MOdUa3n55NTA8ev58RPJuzA8=
        if capitol_contact_info[0] and '@' not in capitol_contact_info[0]:
            capitol_contact_info[0] = None

        # if we have more than 2 chars (eg state)
        # or a phone/fax/email address record the info
        if len(capitol_address) > 2 or not capitol_contact_info.count(
                None) == 3:
            if capitol_contact_info[0] and \
                    '*****@*****.**' in capitol_contact_info[0]:
                self.warning("XXX: GA SITE WAS HACKED.")
                capitol_contact_info[1] = None

            if capitol_address.strip():
                legislator.add_contact_detail(type='address',
                                              value=capitol_address,
                                              note='Capitol Address')
            if capitol_contact_info[1]:
                legislator.add_contact_detail(
                    type='voice',
                    value=capitol_contact_info[1],
                    note='Capitol Address')
            if capitol_contact_info[2]:
                legislator.add_contact_detail(
                    type='fax',
                    value=capitol_contact_info[2],
                    note='Capitol Address')
            if capitol_contact_info[0]:
                legislator.add_contact_detail(
                    type='email',
                    value=capitol_contact_info[0],
                    note='Capitol Address')

        district_address = self.clean_list([
            member_info['DistrictAddress'][x]
            for x in ['Street', 'City', 'State', 'Zip']
        ])
        district_contact_info = self.clean_list([
            member_info['DistrictAddress'][x]
            for x in ['Email', 'Phone', 'Fax']
        ])

        # Same issue with district email. See above comment
        if district_contact_info[0] and '@' not in district_contact_info[0]:
            district_contact_info[0] = None

        district_address = " ".join(addr_component
                                    for addr_component in district_address
                                    if addr_component).strip()

        # BUG FIX: this guard previously re-tested the capitol variables.
        if len(district_address) > 2 or not district_contact_info.count(
                None) == 3:
            # NOTE(review): the capitol block checks index 0 (email) for
            # the hack marker; this checks index 1 (phone) — confirm
            # which is intended.
            if (district_contact_info[1]
                    and '*****@*****.**' in district_contact_info[1]):
                self.warning("XXX: GA SITE WAS HACKED.")
                district_contact_info[1] = None

            if district_address.strip():
                legislator.add_contact_detail(type='address',
                                              value=district_address,
                                              note='District Address')
            if district_contact_info[1]:
                legislator.add_contact_detail(
                    type='voice',
                    value=district_contact_info[1],
                    note='District Address')
            if district_contact_info[2]:
                legislator.add_contact_detail(
                    type='fax',
                    value=district_contact_info[2],
                    note='District Address')
            if district_contact_info[0]:
                legislator.add_contact_detail(
                    type='email',
                    value=district_contact_info[0],
                    note='District Address')

        legislator.add_link(url)
        legislator.add_source(self.ssource)
        legislator.add_source(HOMEPAGE_URLS[chamber].format(**{
            "code": guid,
            "sid": sid
        }))
        yield legislator
def transform_parse(self, parsed_form, response):
    """Transform one parsed LDA Form LD-1 filing into pupa entities.

    Yields, in order: the registrant (a Person for self-employed
    individuals, otherwise an Organization), the registrant's synthetic
    self-employment Organization (individuals only), the client, the
    registrant's main contact, affiliated organizations, foreign
    entities, lobbyists, the registration Event, and the Disclosure.

    :param parsed_form: nested dict produced by the LD-1 form parser.
    :param response: HTTP response the filing came from; only
        ``response.url`` is used (for sourcing).
    """
    _source = {
        "url": response.url,
        "note": "LDA Form LD-1"
    }

    # basic disclosure fields
    _disclosure = Disclosure(
        effective_date=datetime.strptime(
            parsed_form['datetimes']['effective_date'],
            '%Y-%m-%d %H:%M:%S').replace(tzinfo=UTC),
        timezone='America/New_York',
        submitted_date=datetime.strptime(
            parsed_form['datetimes']['signature_date'],
            '%Y-%m-%d %H:%M:%S').replace(tzinfo=UTC),
        classification="lobbying"
    )

    _disclosure.add_authority(name=self.authority.name,
                              type=self.authority._type,
                              id=self.authority._id)

    _disclosure.add_identifier(
        identifier=parsed_form['_meta']['document_id'],
        scheme="urn:sopr:filing"
    )

    # disclosure extras
    _disclosure.extras = {}
    _disclosure.extras['registrant'] = {
        'self_employed_individual':
            parsed_form['registrant']['self_employed_individual'],
        'general_description':
            parsed_form['registrant']['registrant_general_description'],
        'signature': {
            "signature_date": parsed_form['datetimes']['signature_date'],
            "signature": parsed_form['signature']
        }
    }
    _disclosure.extras['client'] = {
        'same_as_registrant': parsed_form['client']['client_self'],
        'general_description':
            parsed_form['client']['client_general_description']
    }
    _disclosure.extras['registration_type'] = {
        'is_amendment': parsed_form['registration_type']['is_amendment'],
        'new_registrant':
            parsed_form['registration_type']['new_registrant'],
        'new_client_for_existing_registrant':
            parsed_form['registration_type'][
                'new_client_for_existing_registrant'],
    }

    #
    # Registrant
    # build registrant: a Person (plus a synthetic self-employment
    # Organization) when self-employed, otherwise an Organization.
    _registrant_self_employment = None

    if parsed_form['registrant']['self_employed_individual']:
        n = ' '.join([p for p in [
            parsed_form['registrant']['registrant_individual_prefix'],
            parsed_form['registrant']['registrant_individual_firstname'],
            parsed_form['registrant']['registrant_individual_lastname']
        ] if len(p) > 0]).strip()

        _registrant = Person(
            name=n,
            source_identified=True
        )

        _registrant_self_employment = Organization(
            name='SELF-EMPLOYMENT of {n}'.format(n=n),
            classification='company',
            source_identified=True
        )

        _registrant.add_membership(
            organization=_registrant_self_employment,
            role='self_employed',
            label='self-employment of {n}'.format(n=n),
            start_date=_disclosure.effective_date.strftime('%Y-%m-%d')
        )
    else:
        _registrant = Organization(
            name=parsed_form['registrant']['registrant_org_name'],
            classification='company',
            source_identified=True
        )

    if len(parsed_form['registrant']['registrant_house_id']) > 0:
        _registrant.add_identifier(
            identifier=parsed_form['registrant']['registrant_house_id'],
            scheme='urn:house_clerk:registrant'
        )

    if len(parsed_form['registrant']['registrant_senate_id']) > 0:
        _registrant.add_identifier(
            identifier=parsed_form['registrant']['registrant_senate_id'],
            scheme='urn:sopr:registrant'
        )

    registrant_contact_details = [
        {
            "type": "address",
            "note": "contact address",
            "value": '; '.join([
                p for p in [
                    parsed_form['registrant']['registrant_address_one'],
                    parsed_form['registrant']['registrant_address_two'],
                    parsed_form['registrant']['registrant_city'],
                    parsed_form['registrant']['registrant_state'],
                    parsed_form['registrant']['registrant_zip'],
                    parsed_form['registrant']['registrant_country']]
                if len(p) > 0]).strip(),
        },
        {
            "type": "voice",
            "note": "contact phone",
            "value": parsed_form['registrant']['registrant_contact_phone'],
        },
        {
            "type": "email",
            "note": "contact email",
            "value": parsed_form['registrant']['registrant_contact_email'],
        },
    ]

    registrant_contact_ppb = {
        "type": "address",
        "note": "principal place of business",
        "value": '; '.join([
            p for p in [
                parsed_form['registrant']['registrant_ppb_city'],
                parsed_form['registrant']['registrant_ppb_state'],
                parsed_form['registrant']['registrant_ppb_zip'],
                parsed_form['registrant']['registrant_ppb_country']]
            if len(p) > 0]).strip(),
    }

    if registrant_contact_ppb["value"]:
        registrant_contact_details.append(registrant_contact_ppb)

    for cd in registrant_contact_details:
        _registrant.add_contact_detail(**cd)

    _registrant.extras = {
        "contact_details_structured": [
            {
                "type": "address",
                "note": "contact address",
                "parts": [
                    {
                        "note": "address_one",
                        "value": parsed_form['registrant'][
                            'registrant_address_one'],
                    },
                    {
                        "note": "address_two",
                        "value": parsed_form['registrant'][
                            'registrant_address_two'],
                    },
                    {
                        "note": "city",
                        "value": parsed_form['registrant'][
                            'registrant_city'],
                    },
                    {
                        "note": "state",
                        "value": parsed_form['registrant'][
                            'registrant_state'],
                    },
                    {
                        "note": "zip",
                        "value": parsed_form['registrant'][
                            'registrant_zip'],
                    },
                    {
                        "note": "country",
                        "value": parsed_form['registrant'][
                            'registrant_country'],
                    }
                ],
            },
            {
                "type": "address",
                "note": "principal place of business",
                "parts": [
                    {
                        "note": "city",
                        "value": parsed_form['registrant'][
                            'registrant_ppb_city'],
                    },
                    {
                        "note": "state",
                        "value": parsed_form['registrant'][
                            'registrant_ppb_state'],
                    },
                    {
                        "note": "zip",
                        "value": parsed_form['registrant'][
                            'registrant_ppb_zip'],
                    },
                    {
                        "note": "country",
                        "value": parsed_form['registrant'][
                            'registrant_ppb_country'],
                    }
                ],
            },
        ]
    }

    #
    # People
    # build contact
    _main_contact = Person(
        name=parsed_form['registrant']['registrant_contact_name'],
        source_identified=True
    )

    main_contact_contact_details = [
        {
            "type": "voice",
            "note": "contact phone",
            "value": parsed_form['registrant']['registrant_contact_phone'],
        },
        {
            "type": "email",
            "note": "contact email",
            "value": parsed_form['registrant']['registrant_contact_email'],
        }
    ]

    for cd in main_contact_contact_details:
        _main_contact.add_contact_detail(**cd)

    # Attach the main contact to whichever org exists: the registrant
    # itself, or the individual's self-employment org.
    if _registrant._type == 'organization':
        _registrant.add_member(
            name_or_person=_main_contact,
            role='main_contact',
            label='main contact for {n}'.format(n=_registrant.name),
            start_date=_disclosure.effective_date.strftime('%Y-%m-%d')
        )
    else:
        _registrant_self_employment.add_member(
            name_or_person=_main_contact,
            role='main_contact',
            label='main contact for {n}'.format(n=_registrant.name),
            start_date=_disclosure.effective_date.strftime('%Y-%m-%d')
        )

    #
    # Client
    # build client
    _client = Organization(
        name=parsed_form['client']['client_name'],
        classification='company',
        source_identified=True
    )

    client_contact_details = [
        {
            "type": "address",
            "note": "contact address",
            "value": '; '.join([
                p for p in [
                    parsed_form['client']['client_address'],
                    parsed_form['client']['client_city'],
                    parsed_form['client']['client_state'],
                    parsed_form['client']['client_zip'],
                    parsed_form['client']['client_country']]
                if len(p) > 0]).strip(),
        },
    ]

    client_contact_ppb = {
        "type": "address",
        "note": "principal place of business",
        "value": '; '.join([
            p for p in [
                parsed_form['client']['client_ppb_city'],
                parsed_form['client']['client_ppb_state'],
                parsed_form['client']['client_ppb_zip'],
                parsed_form['client']['client_ppb_country']]
            if len(p) > 0]).strip(),
    }

    if client_contact_ppb["value"]:
        client_contact_details.append(client_contact_ppb)

    for cd in client_contact_details:
        _client.add_contact_detail(**cd)

    _client.extras = {
        "contact_details_structured": [
            {
                "type": "address",
                "note": "contact address",
                "parts": [
                    {
                        "note": "address",
                        "value": parsed_form['client']['client_address'],
                    },
                    {
                        "note": "city",
                        "value": parsed_form['client']['client_city'],
                    },
                    {
                        "note": "state",
                        "value": parsed_form['client']['client_state'],
                    },
                    {
                        "note": "zip",
                        "value": parsed_form['client']['client_zip'],
                    },
                    {
                        "note": "country",
                        "value": parsed_form['client']['client_country'],
                    }
                ],
            },
            {
                "type": "address",
                "note": "principal place of business",
                "parts": [
                    {
                        "note": "city",
                        "value": parsed_form['client']['client_ppb_city'],
                    },
                    {
                        "note": "state",
                        "value": parsed_form['client']['client_ppb_state'],
                    },
                    {
                        "note": "zip",
                        "value": parsed_form['client']['client_ppb_zip'],
                    },
                    {
                        "note": "country",
                        "value": parsed_form['client'][
                            'client_ppb_country'],
                    }
                ],
            },
        ],
    }

    # Collect Foreign Entities (deduplicated by name)
    _foreign_entities = []
    _foreign_entities_by_name = {}
    for fe in parsed_form['foreign_entities']:
        fe_extras = {}
        fe_name = fe['foreign_entity_name']

        # check for name-based duplicates
        if fe_name in _foreign_entities_by_name:
            _foreign_entity = _foreign_entities_by_name[fe_name]
        else:
            _foreign_entity = Organization(
                name=fe_name,
                classification='company',
                source_identified=True
            )

        # collect contact details
        # NOTE(review): both the second entry below and
        # foreign_entity_contact_ppb carry the "principal place of
        # business" note (the latter additionally includes the city), so
        # an entity may end up with two ppb addresses — confirm intended.
        foreign_entity_contact_details = [
            {
                "type": "address",
                "note": "contact address",
                "value": '; '.join([
                    p for p in [
                        fe['foreign_entity_address'],
                        fe['foreign_entity_city'],
                        fe['foreign_entity_state'],
                        fe['foreign_entity_country']]
                    if len(p) > 0]).strip(),
            },
            {
                "type": "address",
                "note": "principal place of business",
                "value": '; '.join([
                    p for p in [
                        fe['foreign_entity_ppb_state'],
                        fe['foreign_entity_ppb_country']]
                    if len(p) > 0]).strip(),
            },
        ]

        foreign_entity_contact_ppb = {
            "type": "address",
            "note": "principal place of business",
            "value": '; '.join([
                p for p in [
                    fe['foreign_entity_ppb_city'],
                    fe['foreign_entity_ppb_state'],
                    fe['foreign_entity_ppb_country']]
                if len(p) > 0]),
        }

        if foreign_entity_contact_ppb["value"]:
            foreign_entity_contact_details.append(
                foreign_entity_contact_ppb)

        # add contact details
        for cd in foreign_entity_contact_details:
            if cd['value'] != '':
                _foreign_entity.add_contact_detail(**cd)

        # add extras
        fe_extras["contact_details_structured"] = [
            {
                "type": "address",
                "note": "contact address",
                "parts": [
                    {
                        "note": "address",
                        "value": fe['foreign_entity_address'],
                    },
                    {
                        "note": "city",
                        "value": fe['foreign_entity_city'],
                    },
                    {
                        "note": "state",
                        "value": fe['foreign_entity_state'],
                    },
                    {
                        "note": "country",
                        "value": fe['foreign_entity_country'],
                    }
                ],
            },
            {
                "type": "address",
                "note": "principal place of business",
                "parts": [
                    {
                        "note": "state",
                        "value": fe['foreign_entity_ppb_state'],
                    },
                    {
                        "note": "country",
                        "value": fe['foreign_entity_ppb_country'],
                    }
                ],
            },
        ]

        _foreign_entity.extras = combine_dicts(_foreign_entity.extras,
                                               fe_extras)
        _foreign_entities_by_name[fe_name] = _foreign_entity

    for unique_foreign_entity in _foreign_entities_by_name.values():
        _foreign_entities.append(unique_foreign_entity)

    # TODO: add a variant on memberships to represent inter-org
    # relationships (associations, ownership, etc)
    #
    # _client['memberships'].append({
    #     "id": _foreign_entity['id'],
    #     "classification": "organization",
    #     "name": _foreign_entity['name'],
    #     "extras": {
    #         "ownership_percentage":
    #             fe['foreign_entity_amount']
    #     }
    # })

    # Collect Lobbyists (deduplicated by name)
    # TODO: deal with weird non-name line continuation cases
    # (blanks, "continued")
    _lobbyists_by_name = {}

    for lobbyist_record in parsed_form['lobbyists']:
        l_extras = {}
        l_name = ' '.join([lobbyist_record['lobbyist_first_name'],
                           lobbyist_record['lobbyist_last_name'],
                           lobbyist_record['lobbyist_suffix']
                           ]).strip()

        if l_name in _lobbyists_by_name:
            _lobbyist = _lobbyists_by_name[l_name]
        else:
            _lobbyist = Person(
                name=l_name,
                source_identified=True
            )

        if lobbyist_record['lobbyist_covered_official_position']:
            l_extras['lda_covered_official_positions'] = [
                {
                    'date_reported':
                        parsed_form['datetimes']['effective_date'],
                    'covered_official_position':
                        lobbyist_record[
                            'lobbyist_covered_official_position']
                },
            ]

        _lobbyist.extras = combine_dicts(_lobbyist.extras, l_extras)
        _lobbyists_by_name[l_name] = _lobbyist

    _lobbyists = []
    for unique_lobbyist in _lobbyists_by_name.values():
        _lobbyists.append(unique_lobbyist)

    # Lobbyists belong to the registrant org, or to the individual's
    # self-employment org.
    if _registrant._type == 'organization':
        for lobbyist in _lobbyists:
            _registrant.add_member(
                lobbyist,
                role='lobbyist',
                label='lobbyist for {n}'.format(n=_registrant.name),
                start_date=_disclosure.effective_date.strftime('%Y-%m-%d')
            )
    else:
        for lobbyist in _lobbyists:
            _registrant_self_employment.add_member(
                lobbyist,
                role='lobbyist',
                label='lobbyist for {n}'.format(n=_registrant.name),
                start_date=_disclosure.effective_date.strftime('%Y-%m-%d')
            )

    #
    # Document
    # build document
    _disclosure.add_document(
        note='submitted filing',
        date=parsed_form['datetimes']['effective_date'][:10],
        url=response.url
    )

    # Collect Affiliated orgs (deduplicated by name)
    _affiliated_organizations = []
    _affiliated_organizations_by_name = {}
    for ao in parsed_form['affiliated_organizations']:
        ao_extras = {}
        ao_name = ao['affiliated_organization_name']
        if ao_name in _affiliated_organizations_by_name:
            # There's already one by this name
            _affiliated_organization = \
                _affiliated_organizations_by_name[ao_name]
        else:
            # New affiliated org
            _affiliated_organization = Organization(
                name=ao_name,
                classification='company',
                source_identified=True
            )

        # collect contact details
        affiliated_organization_contact_details = [
            {
                "type": "address",
                "note": "contact address",
                "value": '; '.join([
                    p for p in [
                        ao['affiliated_organization_address'],
                        ao['affiliated_organization_city'],
                        ao['affiliated_organization_state'],
                        ao['affiliated_organization_zip'],
                        ao['affiliated_organization_country']]
                    if len(p) > 0]).strip(),
            },
        ]

        affiliated_organization_contact_ppb = {
            "type": "address",
            "note": "principal place of business",
            "value": '; '.join([
                p for p in [
                    ao['affiliated_organization_ppb_city'],
                    ao['affiliated_organization_ppb_state'],
                    ao['affiliated_organization_ppb_country']]
                if len(p) > 0]).strip(),
        }

        if affiliated_organization_contact_ppb["value"]:
            affiliated_organization_contact_details.append(
                affiliated_organization_contact_ppb)

        # add contact details
        for cd in affiliated_organization_contact_details:
            _affiliated_organization.add_contact_detail(**cd)

        # BUG FIX: the original assignment ended with a stray trailing
        # comma, wrapping this list in a 1-tuple (unlike every sibling
        # "contact_details_structured" section).
        ao_extras["contact_details_structured"] = [
            {
                "type": "address",
                "note": "contact address",
                "parts": [
                    {
                        "note": "address",
                        "value": ao['affiliated_organization_address'],
                    },
                    {
                        "note": "city",
                        "value": ao['affiliated_organization_city'],
                    },
                    {
                        "note": "state",
                        "value": ao['affiliated_organization_state'],
                    },
                    {
                        "note": "zip",
                        "value": ao['affiliated_organization_zip'],
                    },
                    {
                        "note": "country",
                        "value": ao['affiliated_organization_country'],
                    }
                ],
            },
            {
                "type": "address",
                "note": "principal place of business",
                "parts": [
                    {
                        "note": "city",
                        "value": ao['affiliated_organization_ppb_city'],
                    },
                    {
                        "note": "state",
                        "value": ao['affiliated_organization_ppb_state'],
                    },
                    {
                        "note": "country",
                        "value": ao['affiliated_organization_ppb_country'],
                    }
                ],
            },
        ]

        _affiliated_organization.extras = combine_dicts(
            _affiliated_organization.extras, ao_extras)

        # BUG FIX: the original never stored the org back into the dedup
        # dict, so affiliated organizations were silently dropped (never
        # yielded nor added as event participants).
        _affiliated_organizations_by_name[ao_name] = \
            _affiliated_organization

    for unique_affiliated_organization in \
            _affiliated_organizations_by_name.values():
        _affiliated_organizations.append(unique_affiliated_organization)

    #
    # Events & Agendas
    # name
    if parsed_form['registration_type']['new_registrant']:
        registration_type = 'New Client, New Registrant'
    elif parsed_form['registration_type']['is_amendment']:
        registration_type = 'Amended Registration'
    else:
        registration_type = 'New Client for Existing Registrant'

    # Create registration event
    _event = Event(
        name="{rn} - {rt}, {cn}".format(rn=_registrant.name,
                                        rt=registration_type,
                                        cn=_client.name),
        timezone='America/New_York',
        location='United States',
        start_time=datetime.strptime(
            parsed_form['datetimes']['effective_date'],
            '%Y-%m-%d %H:%M:%S').replace(tzinfo=UTC),
        classification='registration'
    )

    # add participants
    # BUG FIX: the original repeated this exact add_participant call
    # inside `if _registrant._type == 'person':`, recording the
    # registrant twice for individual registrants; once is enough.
    _event.add_participant(type=_registrant._type,
                           id=_registrant._id,
                           name=_registrant.name,
                           note="registrant")

    _event.add_participant(type=_client._type,
                           id=_client._id,
                           name=_client.name,
                           note="client")

    for lobbyist in _lobbyists:
        _event.add_participant(type=lobbyist._type,
                               id=lobbyist._id,
                               name=lobbyist.name,
                               note='lobbyist')

    for fe in _foreign_entities:
        _event.add_participant(type=fe._type,
                               id=fe._id,
                               name=fe.name,
                               note='foreign_entity')

    for ao in _affiliated_organizations:
        _event.add_participant(type=ao._type,
                               id=ao._id,
                               name=ao.name,
                               note='affiliated_organization')

    # add agenda item
    _agenda = _event.add_agenda_item(
        description='issues lobbied on',
    )

    _agenda['notes'].append(
        parsed_form['lobbying_issues_detail']
    )

    for li in parsed_form['lobbying_issues']:
        if li['general_issue_area'] != '':
            _agenda.add_subject(li['general_issue_area'])

    _disclosure.add_disclosed_event(
        name=_event.name,
        type=_event._type,
        classification=_event.classification,
        id=_event._id
    )

    # add registrant to disclosure's _related and related_entities fields
    _disclosure.add_registrant(name=_registrant.name,
                               type=_registrant._type,
                               id=_registrant._id)

    _registrant.add_source(
        url=_source['url'],
        note='registrant'
    )
    yield _registrant

    if _registrant_self_employment is not None:
        _registrant_self_employment.add_source(
            url=_source['url'],
            note='registrant_self_employment'
        )
        yield _registrant_self_employment

    _client.add_source(
        url=_source['url'],
        note='client'
    )
    yield _client

    _main_contact.add_source(
        url=_source['url'],
        note='main_contact'
    )
    yield _main_contact

    for ao in _affiliated_organizations:
        ao.add_source(
            url=_source['url'],
            note='affiliated_organization'
        )
        yield ao

    for fe in _foreign_entities:
        fe.add_source(
            url=_source['url'],
            note='foreign_entity'
        )
        yield fe

    for lobbyist in _lobbyists:
        lobbyist.add_source(
            url=_source['url'],
            note='lobbyist'
        )
        yield lobbyist

    _event.add_source(**_source)
    yield _event

    _disclosure.add_source(**_source)
    yield _disclosure
def _scrape_lower_chamber(self):
    """Scrape the House roster page and yield a Person per member.

    Vacant seats are handed to ``self._save_vacant_legislator`` instead
    of being yielded.
    """
    self.info("Scraping lower chamber for legislators.")

    chamber = "lower"
    roster_url = self._reps_url
    roster_doc = lxml.html.fromstring(self.get(roster_url).text)

    # This is the ASP.net table container
    member_table = roster_doc.xpath("//table[@id='theTable']")[0]

    for row in member_table.xpath("tr")[3:]:
        # If a given term hasn't occurred yet, then ignore it
        # Eg, in 2017, the 2018 term page will have a blank table
        if row.attrib.get("class") == "dxgvEmptyDataRow":
            self.warning("No House members found")
            return

        cells = row.xpath("td")
        last_name = cells[1].text_content().strip()
        first_name = cells[2].text_content().strip()
        full_name = "{} {}".format(first_name, last_name)
        district = str(int(cells[3].text_content().strip()))

        party = cells[4].text_content().strip()
        party = {"D": "Democratic", "R": "Republican"}.get(party, party)
        if not party.strip():
            # Workaround for now.
            party = "Other"

        phone = cells[6].text_content().strip()
        room = cells[7].text_content().strip()
        address = self._assumed_address_fmt.format(room if room else "")

        if last_name == "Vacant":
            # Vacant seats are recorded separately, not yielded.
            vacancy = Person(name=full_name,
                             primary_org=chamber,
                             district=district,
                             party=party)
            vacancy.extras = {
                "first_name": first_name,
                "last_name": last_name
            }
            vacancy.add_contact_detail(type="address",
                                       value=address,
                                       note="Capitol Office")
            if phone.strip():
                vacancy.add_contact_detail(type="voice",
                                           value=phone,
                                           note="Capitol Office")
            vacancy.add_source(roster_url)
            self._save_vacant_legislator(vacancy)
            continue

        party_override = {
            " Green": "Democratic",
            " Sisco": "Republican"
        }
        if party == "" and full_name in party_override:
            party = party_override[full_name]

        details_url = self._rep_details_url.format(district)
        details_page = lxml.html.fromstring(self.get(details_url).text)

        person = Person(name=full_name,
                        primary_org=chamber,
                        district=district,
                        party=party)
        person.extras = {
            "first_name": first_name,
            "last_name": last_name
        }
        person.add_source(roster_url)
        person.add_source(details_url)
        person.add_link(details_url)

        mailto_hrefs = details_page.xpath(
            '//*[@id="ContentPlaceHolder1_lblAddresses"] '
            '//a[starts-with(@href,"mailto:")]/@href')
        if mailto_hrefs and mailto_hrefs[0].lower() != "mailto:":
            email = mailto_hrefs[0].split(":")[1]
        else:
            email = None

        person.add_contact_detail(type="address",
                                  value=address,
                                  note="Capitol Office")
        if phone:
            person.add_contact_detail(type="voice",
                                      value=phone,
                                      note="Capitol Office")
        if email:
            person.add_contact_detail(type="email",
                                      value=email,
                                      note="Capitol Office")

        photos = details_page.xpath(
            '//*[@id="ContentPlaceHolder1_imgPhoto"]/@src')
        if photos:
            person.image = photos[0]

        yield person
def scrape_session(self, session, chambers):
    """Yield a Person for every active member of one legislative session.

    Skips duplicate records (members who switched chambers mid-session
    appear twice in the listing) and members who have vacated their seat.

    :param session: session key into ``SESSION_SITE_IDS``.
    :param chambers: unused here; kept for interface compatibility.
    """
    sid = SESSION_SITE_IDS[session]
    members = backoff(self.sservice.GetMembersBySession,
                      sid)["MemberListing"]

    # A set makes the duplicate-guid check O(1); iteration order of the
    # listing is unaffected.
    seen_guids = set()
    for member in members:
        guid = member["Id"]
        member_info = backoff(self.sservice.GetMember, guid)

        # If a member switches chambers during the session, they may
        # appear twice. Skip the duplicate record accordingly.
        if guid in seen_guids:
            self.warning("Skipping duplicate record of {}".format(
                member_info["Name"]["Last"]))
            continue
        seen_guids.add(guid)

        # Check to see if the member has vacated; skip if so.
        # A member can have multiple services for a given session,
        # if they switched chambers. Filter these down to just the
        # active service.
        try:
            (legislative_service, ) = [
                service
                for service in member_info["SessionsInService"]
                ["LegislativeService"]
                if service["Session"]["Id"] == sid
                and service["DateVacated"] is None
            ]
        except ValueError:
            self.info("Skipping retired member {}".format(
                member_info["Name"]["Last"]))
            continue

        nick_name, first_name, middle_name, last_name = (
            member_info["Name"][x]
            for x in ["Nickname", "First", "Middle", "Last"])
        # Prefer the nickname when one is given.
        first_name = nick_name if nick_name else first_name
        if middle_name:
            full_name = "%s %s %s" % (first_name, middle_name, last_name)
        else:
            full_name = "%s %s" % (first_name, last_name)

        party = legislative_service["Party"]
        if party == "Democrat":
            party = "Democratic"
        elif party.strip() == "":
            party = "other"

        chamber, district = (legislative_service["District"][x]
                             for x in ["Type", "Number"])
        chamber = {"House": "lower", "Senate": "upper"}[chamber]

        url, photo = self.scrape_homepage(HOMEPAGE_URLS[chamber], {
            "code": guid,
            "sid": sid
        })

        legislator = Person(
            name=full_name,
            district=str(district),
            party=party,
            primary_org=chamber,
            image=photo,
        )
        legislator.extras = {
            "family_name": last_name,
            "given_name": first_name,
            "guid": guid,
        }

        if (member_info["Address"]["Street"] is not None
                and member_info["Address"]["Street"].strip()):
            capitol_address_info = {
                k: v.strip()
                for k, v in dict(member_info["Address"]).items()
                if k in ["Street", "City", "State", "Zip"]
            }
            capitol_address = "{Street}\n{City}, {State} {Zip}".format(
                **capitol_address_info)
            legislator.add_contact_detail(type="address",
                                          value=capitol_address,
                                          note="Capitol Address")
        else:
            self.warning(
                "Could not find full capitol address for {}".format(
                    full_name))

        capitol_contact_info = self.clean_list(
            [member_info["Address"][x] for x in ["Email", "Phone", "Fax"]])

        # Sometimes email is set to a long cryptic string.
        # If it doesn't have a @ character, simply set it to None
        # examples:
        # 01X5dvct3G1lV6RQ7I9o926Q==&c=xT8jBs5X4S7ZX2TOajTx2W7CBprTaVlpcvUvHEv78GI=
        # 01X5dvct3G1lV6RQ7I9o926Q==&c=eSH9vpfdy3XJ989Gpw4MOdUa3n55NTA8ev58RPJuzA8=
        if capitol_contact_info[0] and "@" not in capitol_contact_info[0]:
            capitol_contact_info[0] = None

        if (capitol_contact_info[0]
                and "*****@*****.**" in capitol_contact_info[0]):
            # Site was hacked in the past. Fail loudly rather than ingest
            # tainted data; a bare `assert` would be stripped under
            # `python -O`.
            raise ValueError(
                "Hacked-site marker in capitol email for {}".format(
                    full_name))

        if capitol_contact_info[1]:
            legislator.add_contact_detail(type="voice",
                                          value=capitol_contact_info[1],
                                          note="Capitol Address")
        if capitol_contact_info[2]:
            legislator.add_contact_detail(type="fax",
                                          value=capitol_contact_info[2],
                                          note="Capitol Address")
        if capitol_contact_info[0]:
            legislator.add_contact_detail(type="email",
                                          value=capitol_contact_info[0],
                                          note="Capitol Address")

        if (member_info["DistrictAddress"]["Street"] is not None
                and member_info["DistrictAddress"]["Street"].strip()):
            district_address_info = {
                k: v.strip()
                for k, v in dict(member_info["DistrictAddress"]).items()
                if k in ["Street", "City", "State", "Zip"]
            }
            district_address = "{Street}\n{City}, {State} {Zip}".format(
                **district_address_info)
            legislator.add_contact_detail(type="address",
                                          value=district_address,
                                          note="District Address")
        else:
            self.warning(
                "Could not find full district address for {}".format(
                    full_name))

        district_contact_info = self.clean_list([
            member_info["DistrictAddress"][x]
            for x in ["Email", "Phone", "Fax"]
        ])

        # Same issue with district email; see the capitol email comment.
        if district_contact_info[0] and "@" not in district_contact_info[0]:
            district_contact_info[0] = None

        if (district_contact_info[0]
                and "*****@*****.**" in district_contact_info[0]):
            # Site was hacked in the past (see above).
            raise ValueError(
                "Hacked-site marker in district email for {}".format(
                    full_name))

        if district_contact_info[1]:
            legislator.add_contact_detail(
                type="voice",
                value=district_contact_info[1],
                note="District Address",
            )
        if district_contact_info[2]:
            legislator.add_contact_detail(type="fax",
                                          value=district_contact_info[2],
                                          note="District Address")
        if district_contact_info[0]:
            legislator.add_contact_detail(
                type="email",
                value=district_contact_info[0],
                note="District Address",
            )

        legislator.add_link(url)
        legislator.add_source(self.ssource)
        legislator.add_source(HOMEPAGE_URLS[chamber].format(**{
            "code": guid,
            "sid": sid
        }))

        yield legislator
def _scrape_lower_chamber(self):
    """Scrape the House roster (DevExpress grid) and yield Person objects.

    Vacant seats are recorded via ``self._save_vacant_legislator``
    rather than yielded.
    """
    self.info('Scraping lower chamber for legislators.')

    chamber = 'lower'
    roster_url = self._reps_url
    roster_doc = lxml.html.fromstring(self.get(roster_url).text)

    # This is the ASP.net table container
    member_table = roster_doc.xpath(
        'id("ContentPlaceHolder1_gridMembers_DXMainTable")')[0]

    for row in member_table.xpath('tr')[1:]:
        # If a given term hasn't occurred yet, then ignore it
        # Eg, in 2017, the 2018 term page will have a blank table
        if row.attrib.get('class') == 'dxgvEmptyDataRow':
            self.warning('No House members found')
            return

        cells = row.xpath('td')
        last_name = cells[0].text_content().strip()
        first_name = cells[1].text_content().strip()
        full_name = '{} {}'.format(first_name, last_name)
        district = str(int(cells[2].text_content().strip()))

        party = cells[3].text_content().strip()
        party = {'Democrat': 'Democratic'}.get(party, party)
        if not party.strip():
            # Workaround for now.
            party = "Other"

        phone = cells[4].text_content().strip()
        room = cells[5].text_content().strip()
        address = self._assumed_address_fmt.format(room if room else '')

        if last_name == 'Vacant':
            # Vacant seats go to the vacancy store, not the yield stream.
            vacancy = Person(
                name=full_name,
                primary_org=chamber,
                district=district,
                party=party,
            )
            vacancy.extras = {
                'first_name': first_name,
                'last_name': last_name,
            }
            vacancy.add_contact_detail(type='address',
                                       value=address,
                                       note='Capitol Office')
            if phone.strip():
                vacancy.add_contact_detail(type='voice',
                                           value=phone,
                                           note='Capitol Office')
            vacancy.add_source(roster_url)
            self._save_vacant_legislator(vacancy)
            continue

        party_override = {
            " Green": "Democratic",
            " Sisco": "Republican",
        }
        if party == "" and full_name in party_override:
            party = party_override[full_name]

        details_url = self._rep_details_url.format(district)
        details_page = lxml.html.fromstring(self.get(details_url).text)

        person = Person(
            name=full_name,
            primary_org=chamber,
            district=district,
            party=party,
        )
        person.extras = {
            'first_name': first_name,
            'last_name': last_name,
        }
        person.add_source(roster_url)
        person.add_source(details_url)
        person.add_link(details_url)

        email_hrefs = details_page.xpath(
            '//*[@id="ContentPlaceHolder1_lblAddresses"]'
            '/table/tr[4]/td/a/@href'
        )
        if email_hrefs and email_hrefs[0].lower() != 'mailto:':
            email = email_hrefs[0].split(':')[1]
        else:
            email = None

        person.add_contact_detail(type='address',
                                  value=address,
                                  note='Capitol Office')
        if phone:
            person.add_contact_detail(type='voice',
                                      value=phone,
                                      note='Capitol Office')
        if email:
            person.add_contact_detail(type='email',
                                      value=email,
                                      note='Capitol Office')

        photos = details_page.xpath(
            '//*[@id="ContentPlaceHolder1_imgPhoto"]/@src')
        if photos:
            person.image = photos[0]

        yield person
def scrape_session(self, session, chambers):
    """Yield a Person for every active member of one legislative session.

    Skips duplicate records (members who switched chambers mid-session
    appear twice) and members who have vacated their seat.

    :param session: session key into ``SESSION_SITE_IDS``.
    :param chambers: unused here; kept for interface compatibility.
    """
    sid = SESSION_SITE_IDS[session]
    members = backoff(
        self.sservice.GetMembersBySession,
        sid
    )['MemberListing']

    # A set gives O(1) duplicate checks.
    seen_guids = set()
    for member in members:
        guid = member['Id']
        member_info = backoff(self.sservice.GetMember, guid)

        # If a member switches chambers during the session, they may
        # appear twice. Skip the duplicate record accordingly.
        if guid in seen_guids:
            self.warning('Skipping duplicate record of {}'.format(
                member_info['Name']['Last']))
            continue
        seen_guids.add(guid)

        # Check to see if the member has vacated; skip if so.
        # A member can have multiple services for a given session,
        # if they switched chambers. Filter these down to just the
        # active service.
        try:
            (legislative_service, ) = [
                service for service
                in member_info['SessionsInService']['LegislativeService']
                if service['Session']['Id'] == sid
                and service['DateVacated'] is None
            ]
        except ValueError:
            self.info('Skipping retired member {}'.format(
                member_info['Name']['Last']))
            continue

        nick_name, first_name, middle_name, last_name = (
            member_info['Name'][x] for x in [
                'Nickname', 'First', 'Middle', 'Last'
            ]
        )
        # Prefer the nickname when one is given.
        first_name = nick_name if nick_name else first_name
        if middle_name:
            full_name = "%s %s %s" % (first_name, middle_name, last_name)
        else:
            full_name = "%s %s" % (first_name, last_name)

        party = legislative_service['Party']
        if party == 'Democrat':
            party = 'Democratic'
        elif party.strip() == '':
            party = 'other'

        chamber, district = (
            legislative_service['District'][x] for x in [
                'Type', 'Number'
            ]
        )
        chamber = {
            "House": 'lower',
            "Senate": 'upper'
        }[chamber]

        url, photo = self.scrape_homepage(HOMEPAGE_URLS[chamber],
                                          {"code": guid, "sid": sid})

        legislator = Person(
            name=full_name,
            district=str(district),
            party=party,
            primary_org=chamber,
            image=photo,
        )
        legislator.extras = {
            'family_name': last_name,
            'given_name': first_name,
            'guid': guid,
        }

        if member_info['Address']['Street'] is not None and \
                member_info['Address']['Street'].strip():
            capitol_address_info = {
                k: v.strip()
                for k, v in dict(member_info['Address']).items()
                if k in ['Street', 'City', 'State', 'Zip']
            }
            capitol_address = '{Street}\n{City}, {State} {Zip}'.format(
                **capitol_address_info)
            legislator.add_contact_detail(
                type='address', value=capitol_address,
                note='Capitol Address')
        else:
            self.warning(
                'Could not find full capitol address for {}'.format(
                    full_name))

        capitol_contact_info = self.clean_list([
            member_info['Address'][x] for x in [
                'Email', 'Phone', 'Fax'
            ]
        ])

        # Sometimes email is set to a long cryptic string.
        # If it doesn't have a @ character, simply set it to None
        # examples:
        # 01X5dvct3G1lV6RQ7I9o926Q==&c=xT8jBs5X4S7ZX2TOajTx2W7CBprTaVlpcvUvHEv78GI=
        # 01X5dvct3G1lV6RQ7I9o926Q==&c=eSH9vpfdy3XJ989Gpw4MOdUa3n55NTA8ev58RPJuzA8=
        if capitol_contact_info[0] and '@' not in capitol_contact_info[0]:
            capitol_contact_info[0] = None

        if (capitol_contact_info[0]
                and '*****@*****.**' in capitol_contact_info[0]):
            # Site was hacked in the past. Fail loudly rather than ingest
            # tainted data; a bare `assert` would be stripped under
            # `python -O`.
            raise ValueError(
                'Hacked-site marker in capitol email for {}'.format(
                    full_name))

        if capitol_contact_info[1]:
            legislator.add_contact_detail(
                type='voice', value=capitol_contact_info[1],
                note='Capitol Address')
        if capitol_contact_info[2]:
            legislator.add_contact_detail(
                type='fax', value=capitol_contact_info[2],
                note='Capitol Address')
        if capitol_contact_info[0]:
            legislator.add_contact_detail(
                type='email', value=capitol_contact_info[0],
                note='Capitol Address')

        if member_info['DistrictAddress']['Street'] is not None and \
                member_info['DistrictAddress']['Street'].strip():
            district_address_info = {
                k: v.strip()
                for k, v in dict(member_info['DistrictAddress']).items()
                if k in ['Street', 'City', 'State', 'Zip']
            }
            district_address = '{Street}\n{City}, {State} {Zip}'.format(
                **district_address_info)
            legislator.add_contact_detail(
                type='address', value=district_address,
                note='District Address')
        else:
            self.warning(
                'Could not find full district address for {}'.format(
                    full_name))

        district_contact_info = self.clean_list([
            member_info['DistrictAddress'][x] for x in [
                'Email', 'Phone', 'Fax'
            ]
        ])

        # Same issue with district email; see the capitol email comment.
        if district_contact_info[0] and '@' not in district_contact_info[0]:
            district_contact_info[0] = None

        if (district_contact_info[0]
                and '*****@*****.**' in district_contact_info[0]):
            # Site was hacked in the past (see above).
            raise ValueError(
                'Hacked-site marker in district email for {}'.format(
                    full_name))

        if district_contact_info[1]:
            legislator.add_contact_detail(
                type='voice', value=district_contact_info[1],
                note='District Address')
        if district_contact_info[2]:
            legislator.add_contact_detail(
                type='fax', value=district_contact_info[2],
                note='District Address')
        if district_contact_info[0]:
            legislator.add_contact_detail(
                type='email', value=district_contact_info[0],
                note='District Address')

        legislator.add_link(url)
        legislator.add_source(self.ssource)
        legislator.add_source(HOMEPAGE_URLS[chamber].format(
            **{"code": guid, "sid": sid}))

        yield legislator
def scrape(self):
    """Scrape NYC council members and their committees.

    Yields a ``Person`` for each council member (with merged terms,
    party, photo, contact info, and committee memberships) followed by
    the committee/subcommittee ``Organization`` objects, plus two
    hard-coded organizations missing from the Legistar department list.
    """
    # Bodies that look like committees but should not be emitted as such.
    noncommittees = {'Committee of the Whole'}
    committee_d = {}
    people_d = {}

    # Go to memberlist: restrict the Legistar query to City Council members.
    extra_args = {'ctl00$ContentPlaceHolder$lstName': 'City Council'}

    # A member can appear once per term; group all of their rows by the
    # URL of their person page so terms can be merged below. Rows with
    # no URL are skipped entirely.
    for councilman, committees in self.councilMembers(extra_args=extra_args):
        if 'url' in councilman['Person Name']:
            councilman_url = councilman['Person Name']['url']

            if councilman_url in people_d:
                people_d[councilman_url][0].append(councilman)
            else:
                # First sighting: keep this row's committee listing too.
                people_d[councilman_url] = [councilman], committees

    for person_entries, committees in people_d.values():
        # Use the most recent row as the canonical record.
        councilman = person_entries[-1]
        p = Person(councilman['Person Name']['label'])

        # Special-case disambiguation: the source lists her as
        # "Letitia James"; keep that as an alternate name.
        if p.name == 'Letitia James':
            p.name = 'Letitia Ms. James'
            p.add_name('Letitia James')

        # One (start, end, district) span per source row.
        spans = [(self.toTime(entry['Start Date']).date(),
                  self.toTime(entry['End Date']).date(),
                  entry['District'])
                 for entry in person_entries]

        # Merge chronologically-adjacent spans (end date followed the
        # next day by the next start date) in the same district into a
        # single continuous term.
        merged_spans = []
        last_end_date = None
        last_district = None
        for start_date, end_date, district in sorted(spans):
            if last_end_date is None:
                span = [start_date, end_date, district]
            elif (start_date - last_end_date) == datetime.timedelta(1) \
                    and district == last_district:
                span[1] = end_date
            else:
                merged_spans.append(span)
                span = [start_date, end_date, district]
            last_end_date = end_date
            last_district = district
        merged_spans.append(span)

        for start_date, end_date, district in merged_spans:
            # NOTE(review): this overwrites the merged span's district
            # with the latest row's district for every term — confirm
            # that is intentional (historic districts are discarded).
            district = councilman['District'].replace(' 0', ' ')
            if end_date == datetime.date(2017, 12, 31):
                # Sentinel end date meaning "current term": leave open.
                end_date = ''
            else:
                end_date = end_date.isoformat()
            p.add_term('Council Member', 'legislature',
                       district=district,
                       start_date=start_date.isoformat(),
                       end_date=end_date)

        party = councilman['Political Party']
        if party == 'Democrat':
            party = 'Democratic'
        if party:
            p.add_party(party)

        if councilman['Photo']:
            p.image = councilman['Photo']

        if councilman["E-mail"]:
            p.add_contact_detail(type="email",
                                 value=councilman['E-mail']['url'],
                                 note='E-mail')

        if councilman['Web site']:
            p.add_link(councilman['Web site']['url'], note='web site')

        p.extras = {'Notes': councilman['Notes']}

        p.add_source(councilman['Person Name']['url'], note='web')

        # Attach committee memberships, creating each committee
        # Organization the first time it is seen.
        for committee, _, _ in committees:
            committee_name = committee['Department Name']['label']
            if committee_name not in noncommittees \
                    and 'committee' in committee_name.lower():
                o = committee_d.get(committee_name, None)
                if o is None:
                    parent_id = PARENT_ORGS.get(committee_name,
                                                'New York City Council')
                    o = Organization(committee_name,
                                     classification='committee',
                                     parent_id={'name': parent_id})
                    o.add_source(committee['Department Name']['url'])
                    committee_d[committee_name] = o

                membership = o.add_member(p, role=committee["Title"])
                membership.start_date = self.mdY2Ymd(committee["Start Date"])
        yield p

    # Emit committees before subcommittees so parents import first.
    for o in committee_d.values():
        if 'Committee' in o.name:
            yield o
    for o in committee_d.values():
        if 'Subcommittee' in o.name:
            yield o

    # These two bodies are absent from the scraped department listing,
    # so emit them by hand.
    o = Organization(
        'Committee on Mental Health, Developmental Disability, Alcoholism, '
        'Drug Abuse and Disability Services',
        classification='committee',
        parent_id={'name': 'New York City Council'})
    o.add_source("http://legistar.council.nyc.gov/Departments.aspx")
    yield o

    o = Organization(
        'Subcommittee on Drug Abuse',
        classification='committee',
        parent_id={
            'name': 'Committee on Mental Health, Developmental Disability, '
                    'Alcoholism, Drug Abuse and Disability Services'
        })
    o.add_source("http://legistar.council.nyc.gov/Departments.aspx")
    yield o
def scrape_session(self, session, chambers):
    """Scrape all active Georgia legislators for one session.

    Pulls the member list from the SOAP service, skips duplicates and
    vacated members, and yields one ``Person`` per active legislator
    with party, chamber/district, photo, and Capitol/District contact
    information attached.
    """
    sid = SESSION_SITE_IDS[session]
    members = backoff(self.sservice.GetMembersBySession, sid)['MemberListing']

    seen_guids = []
    for member in members:
        guid = member['Id']
        member_info = backoff(self.sservice.GetMember, guid)

        # If a member switches chambers during the session, they may
        # appear twice. Skip the duplicate record accordingly.
        if guid in seen_guids:
            self.warning('Skipping duplicate record of {}'.format(
                member_info['Name']['Last']))
            continue
        else:
            seen_guids.append(guid)

        # Check to see if the member has vacated; skip if so.
        # A member can have multiple services for a given session,
        # if they switched chambers. Filter these down to just the
        # active service.
        try:
            (legislative_service, ) = [
                service
                for service in member_info['SessionsInService']
                ['LegislativeService']
                if service['Session']['Id'] == sid
                and service['DateVacated'] is None
            ]
        except ValueError:
            # Zero (or >1) active services unpacks to a ValueError.
            self.info('Skipping retired member {}'.format(
                member_info['Name']['Last']))
            continue

        nick_name, first_name, middle_name, last_name = (
            member_info['Name'][x]
            for x in ['Nickname', 'First', 'Middle', 'Last'])

        # Prefer the nickname as the displayed first name when present.
        first_name = nick_name if nick_name else first_name
        if middle_name:
            full_name = "%s %s %s" % (first_name, middle_name, last_name)
        else:
            full_name = "%s %s" % (first_name, last_name)

        party = legislative_service['Party']
        if party == 'Democrat':
            party = 'Democratic'
        elif party.strip() == '':
            party = 'other'

        chamber, district = (legislative_service['District'][x]
                             for x in ['Type', 'Number'])
        chamber = {"House": 'lower', "Senate": 'upper'}[chamber]

        url, photo = self.scrape_homepage(HOMEPAGE_URLS[chamber], {
            "code": guid,
            "sid": sid
        })

        legislator = Person(
            name=full_name,
            district=str(district),
            party=party,
            primary_org=chamber,
            image=photo,
        )
        legislator.extras = {
            'last_name': last_name,
            'first_name': first_name,
            'guid': guid,
        }

        # Capitol and District offices share the same record shape, so
        # a single helper handles both.
        self._add_office_contact_details(
            legislator, member_info['Address'],
            'Capitol Address', 'capitol', full_name)
        self._add_office_contact_details(
            legislator, member_info['DistrictAddress'],
            'District Address', 'district', full_name)

        legislator.add_link(url)
        legislator.add_source(self.ssource)
        legislator.add_source(HOMEPAGE_URLS[chamber].format(**{
            "code": guid,
            "sid": sid
        }))

        yield legislator

def _add_office_contact_details(self, legislator, office_info, note,
                                kind, full_name):
    """Attach one office's address/phone/fax/email to *legislator*.

    :param office_info: SOAP record with ``Street``/``City``/``State``/
        ``Zip``/``Email``/``Phone``/``Fax`` fields.
    :param note: ContactDetail note, e.g. ``'Capitol Address'``.
    :param kind: lowercase label for warnings ('capitol' / 'district').
    :param full_name: member name used in the missing-address warning.
    """
    if office_info['Street'] is not None and office_info['Street'].strip():
        address_parts = {
            k: v.strip()
            for k, v in dict(office_info).items()
            if k in ['Street', 'City', 'State', 'Zip']
        }
        address = '{Street}\n{City}, {State} {Zip}'.format(**address_parts)
        legislator.add_contact_detail(type='address',
                                      value=address,
                                      note=note)
    else:
        self.warning('Could not find full {} address for {}'.format(
            kind, full_name))

    email, phone, fax = self.clean_list(
        [office_info[x] for x in ['Email', 'Phone', 'Fax']])

    # Sometimes email is set to a long cryptic string.
    # If it doesn't have a @ character, simply set it to None
    # examples:
    # 01X5dvct3G1lV6RQ7I9o926Q==&c=xT8jBs5X4S7ZX2TOajTx2W7CBprTaVlpcvUvHEv78GI=
    # 01X5dvct3G1lV6RQ7I9o926Q==&c=eSH9vpfdy3XJ989Gpw4MOdUa3n55NTA8ev58RPJuzA8=
    if email and '@' not in email:
        email = None

    if email:
        # Site was hacked in the past
        assert '*****@*****.**' not in email

    if phone:
        legislator.add_contact_detail(type='voice', value=phone, note=note)
    if fax:
        legislator.add_contact_detail(type='fax', value=fax, note=note)
    if email:
        legislator.add_contact_detail(type='email', value=email, note=note)