def test_full_person():
    """Import a fully populated scraped person and check each related record."""
    scraped = ScrapePerson('Tom Sawyer')
    scraped.add_identifier('1')
    scraped.add_name('Tommy', start_date='1880')
    scraped.add_contact_detail(type='phone',
                               value='555-555-1234',
                               note='this is fake')
    scraped.add_link('http://example.com/link')
    scraped.add_source('http://example.com/source')

    # Run the serialized person through the importer.
    PersonImporter('jurisdiction-id').import_data([scraped.as_dict()])

    # Exactly one person should exist now; verify it round-tripped intact.
    stored = Person.objects.get()
    assert 'ocd-person' in stored.id
    assert stored.name == scraped.name

    identifier = stored.identifiers.all()[0]
    assert identifier.identifier == '1'
    assert identifier.scheme == ''

    other_name = stored.other_names.all()[0]
    assert other_name.name == 'Tommy'
    assert other_name.start_date == '1880'

    contact = stored.contact_details.all()[0]
    assert contact.type == 'phone'
    assert contact.value == '555-555-1234'
    assert contact.note == 'this is fake'

    assert stored.links.all()[0].url == 'http://example.com/link'
    assert stored.sources.all()[0].url == 'http://example.com/source'
def test_bill_sponsor_by_identifier():
    """A sponsorship given only by identifier/scheme resolves to the right person."""
    create_jurisdiction()
    organization = create_org()

    scraped_bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act',
                              classification='tax bill', chamber='lower')
    scraped_bill.add_sponsorship_by_identifier(
        name="SNODGRASS",
        classification='sponsor',
        entity_type='person',
        primary=True,
        identifier="TOTALLY_REAL_ID",
        scheme="TOTALLY_REAL_SCHEME",
    )

    org_importer = OrganizationImporter('jid')
    person_importer = PersonImporter('jid')

    # Import the person who owns the identifier the bill refers to.
    scraped_person = ScrapePerson(name='Zadock Snodgrass')
    scraped_person.add_identifier(identifier='TOTALLY_REAL_ID',
                                  scheme='TOTALLY_REAL_SCHEME')
    person_importer.import_data([scraped_person.as_dict()])

    # Attach the imported person to the organization.
    db_person = Person.objects.get()
    Membership.objects.create(person_id=db_person.id,
                              organization_id=organization.id)

    bill_importer = BillImporter('jid', org_importer, person_importer)
    bill_importer.import_data([scraped_bill.as_dict()])

    # Exactly one sponsorship is expected; unpacking fails otherwise.
    (sponsorship, ) = Bill.objects.get().sponsorships.all()
    assert sponsorship.person.name == "Zadock Snodgrass"
def test_bill_sponsor_by_identifier():
    """Check that an identifier-based sponsorship is linked to the imported person."""
    create_jurisdiction()
    organization = create_org()

    sponsorship_kwargs = dict(
        name="SNODGRASS",
        classification='sponsor',
        entity_type='person',
        primary=True,
        identifier="TOTALLY_REAL_ID",
        scheme="TOTALLY_REAL_SCHEME",
    )
    scraped_bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act',
                              classification='tax bill', chamber='lower')
    scraped_bill.add_sponsorship_by_identifier(**sponsorship_kwargs)

    org_importer = OrganizationImporter('jid')
    person_importer = PersonImporter('jid')

    # The person carrying the identifier the sponsorship points at.
    scraped_person = ScrapePerson(name='Zadock Snodgrass')
    scraped_person.add_identifier(identifier='TOTALLY_REAL_ID',
                                  scheme='TOTALLY_REAL_SCHEME')
    person_importer.import_data([scraped_person.as_dict()])

    db_person = Person.objects.get()
    Membership.objects.create(person_id=db_person.id,
                              organization_id=organization.id)

    BillImporter('jid', org_importer, person_importer).import_data(
        [scraped_bill.as_dict()])

    imported_bill = Bill.objects.get()
    # Single-element unpacking doubles as a "exactly one sponsorship" check.
    (sponsorship,) = imported_bill.sponsorships.all()
    assert sponsorship.person.name == "Zadock Snodgrass"
def scrape(self):
    """Yield a ``Person`` for each historical legislator with a term ending in 1970 or later.

    Reads ``congress-legislators/legislators-historical.yaml`` located next
    to this module.  Legislators whose terms all ended before 1970 are
    skipped.  For each remaining legislator the terms, the distinct parties
    seen across those terms, every external identifier and a source URL are
    attached before the person is yielded.
    """
    data_path = (Path(__file__).parent /
                 'congress-legislators/legislators-historical.yaml')

    # Prefer the libyaml-backed loader but fall back to the pure-Python one
    # when PyYAML was built without libyaml (yaml.CLoader is then missing).
    # NOTE(review): CLoader/Loader are the non-safe loaders; acceptable only
    # because this file is a trusted local checkout -- consider CSafeLoader.
    loader = getattr(yaml, 'CLoader', yaml.Loader)

    # The data contains non-ASCII names, so do not depend on the platform's
    # default encoding.
    with data_path.open(encoding='utf-8') as f:
        legislators = yaml.load(f, Loader=loader)

    for legislator in legislators:
        # Dates are ISO strings, so a plain string comparison orders them.
        if all(term['end'] < '1970' for term in legislator['terms']):
            continue

        full_name = ' '.join(
            (legislator['name']['first'], legislator['name']['last']))
        person = Person(name=full_name,
                        birth_date=legislator['bio'].get('birthday', ''),
                        gender=legislator['bio']['gender'])

        parties = set()
        for term in legislator['terms']:
            state = term['state']
            parties.add(term['party'])

            if term['type'] == 'rep':
                role = 'Representative'
                district_name = self._district_name(state, term['district'])
                chamber = 'lower'
            else:
                role = "Senator"
                district_name = "{state}, Class {klass}".format(
                    state=state, klass=term['class'])
                chamber = 'upper'

            person.add_term(role, chamber,
                            district=district_name,
                            start_date=term['start'],
                            end_date=term['end'])

        for party in parties:
            person.add_party(party)

        for scheme, identifier in legislator['id'].items():
            # Some schemes hold several values; record each one separately
            # instead of storing the string form of the whole list.
            values = identifier if isinstance(identifier, list) else [identifier]
            for value in values:
                person.add_identifier(str(value), scheme=scheme)

        person.add_source(
            'https://github.com/unitedstates/congress-legislators/blob/master/legislators-historical.yaml'
        )
        yield person
def test_bill_sponsor_limit_lookup():
    """Identifier-based sponsor lookup must stay inside the bill's jurisdiction.

    Two people share a name and an identifier but differ in birth date and
    jurisdiction.  The sponsorship on a bill imported for ``'jid'`` has to
    resolve to the person imported for ``'jid'`` (born 1800), never to the
    one imported for ``'another-jurisdiction'`` (born 1900).
    """
    create_jurisdiction()
    org = create_org()

    bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act',
                      classification='tax bill', chamber='lower')
    bill.add_sponsorship_by_identifier(name="SNODGRASS",
                                       classification='sponsor',
                                       entity_type='person',
                                       primary=True,
                                       identifier="TOTALLY_REAL_ID",
                                       scheme="TOTALLY_REAL_SCHEME")

    oi = OrganizationImporter('jid')
    pi = PersonImporter('jid')

    zs = ScrapePerson(name='Zadock Snodgrass', birth_date="1800-01-01")
    zs.add_identifier(identifier='TOTALLY_REAL_ID',
                      scheme='TOTALLY_REAL_SCHEME')
    pi.import_data([zs.as_dict()])

    za_db = Person.objects.get()
    Membership.objects.create(person_id=za_db.id,
                              organization_id=org.id)

    zs2 = ScrapePerson(name='Zadock Snodgrass', birth_date="1900-01-01")
    zs2.add_identifier(identifier='TOTALLY_REAL_ID',
                       scheme='TOTALLY_REAL_SCHEME')

    # This is contrived and perhaps broken, but we're going to check this.
    # We *really* don't want to *ever* cross jurisdiction bounds.
    #
    # Import the second person (zs2), not zs again: the birth_date assertion
    # below can only catch a cross-jurisdiction match if the out-of-
    # jurisdiction record is distinguishable from the in-jurisdiction one.
    PersonImporter('another-jurisdiction').import_data([zs2.as_dict()])

    BillImporter('jid', oi, pi).import_data([bill.as_dict()])

    obj = Bill.objects.get()
    (entry, ) = obj.sponsorships.all()
    assert entry.person.name == "Zadock Snodgrass"
    assert entry.person.birth_date == "1800-01-01"
def test_bill_sponsor_limit_lookup():
    """Sponsor lookup by identifier must not cross jurisdiction boundaries.

    The same name and identifier are imported twice: once for ``'jid'`` with
    birth date 1800-01-01 and once for ``'another-jurisdiction'`` with birth
    date 1900-01-01.  A bill imported for ``'jid'`` must link its sponsor to
    the 1800 record.
    """
    create_jurisdiction()
    org = create_org()

    bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act',
                      classification='tax bill', chamber='lower')
    bill.add_sponsorship_by_identifier(name="SNODGRASS",
                                       classification='sponsor',
                                       entity_type='person',
                                       primary=True,
                                       identifier="TOTALLY_REAL_ID",
                                       scheme="TOTALLY_REAL_SCHEME")

    oi = OrganizationImporter('jid')
    pi = PersonImporter('jid')

    # In-jurisdiction person: the one the sponsorship should resolve to.
    zs = ScrapePerson(name='Zadock Snodgrass', birth_date="1800-01-01")
    zs.add_identifier(identifier='TOTALLY_REAL_ID',
                      scheme='TOTALLY_REAL_SCHEME')
    pi.import_data([zs.as_dict()])

    za_db = Person.objects.get()
    Membership.objects.create(person_id=za_db.id,
                              organization_id=org.id)

    # Out-of-jurisdiction look-alike with a different birth date.
    zs2 = ScrapePerson(name='Zadock Snodgrass', birth_date="1900-01-01")
    zs2.add_identifier(identifier='TOTALLY_REAL_ID',
                       scheme='TOTALLY_REAL_SCHEME')

    # This is contrived and perhaps broken, but we're going to check this.
    # We *really* don't want to *ever* cross jurisdiction bounds.
    #
    # zs2 (not zs) has to be imported here; re-importing zs would make both
    # records carry the 1800 birth date and the final assertion would pass
    # even if the lookup picked the wrong jurisdiction's person.
    PersonImporter('another-jurisdiction').import_data([zs2.as_dict()])

    BillImporter('jid', oi, pi).import_data([bill.as_dict()])

    obj = Bill.objects.get()
    (entry,) = obj.sponsorships.all()
    assert entry.person.name == "Zadock Snodgrass"
    assert entry.person.birth_date == "1800-01-01"
def scrape_legislator(self, legislator_id):
    """Fetch one legislator record from the legacy API and convert it to a ``Person``.

    The record returned by ``self.api`` is consumed key by key with
    ``pop``: each key is either translated onto the new ``Person``, copied
    into ``extras``, or deliberately discarded.  The final assertion fails
    if any key was left unhandled, so unknown fields are never silently
    dropped.

    Every id in ``all_ids`` is registered in ``self._people`` as an alias of
    the new person.  Roles are processed with the record's own ``id`` as
    ``leg_id``.

    Returns the populated ``Person``.
    """
    old = self.api('legislators/' + legislator_id + '?')

    # just not needed
    primary_id = old.pop('id')
    old.pop('created_at')
    old.pop('updated_at')
    old.pop('country', None)
    old.pop('level', None)
    old.pop('state')
    old.pop('leg_id')
    old.pop('active')

    # junk keys
    old.pop('suffix', None)
    old.pop('notice', None)
    old.pop('csrfmiddlewaretoken', None)
    old.pop('office_address', None)
    old.pop('office_phone', None)

    # translated
    district = old.pop('district', None)
    chamber = old.pop('chamber', None)
    image = old.pop('photo_url', '')
    name = old.pop('full_name')
    party = old.pop('party', None)

    if party in ('Nonpartisan', 'unknown', 'Unknown', 'Unaffiliated',
                 "Non Affiliated", " "):
        party = None
    elif party == 'Democrat':
        party = 'Democratic'

    if self.state in ('ne', 'dc'):
        chamber = 'legislature'

    if chamber == 'upper' and self.state == 'pr':
        # Upper-chamber districts for 'pr' are rewritten as Roman numerals.
        pr_district = {
            '1': 'I',
            '2': 'II',
            '3': 'III',
            '4': 'IV',
            '5': 'V',
            '6': 'VI',
            '7': 'VII',
            '8': 'VIII',
        }
        if district in pr_district:
            district = pr_district[district]

    # Some records carry this term's roles as a stray top-level key; fold it
    # into old_roles so it is processed with the rest.
    if '2008-2011' in old:
        old['old_roles']['2008-2011'] = old.pop('2008-2011')
    old_roles = old.pop('old_roles', {})

    if old['roles'] and 'Lt. Governor' in [x['type'] for x in old['roles']]:
        new = Person(name=name, district=district, party=party, image=image)
        self.jurisdiction._executive.add_post('Lt. Governor', 'lt-gov')
        membership = Membership(
            person_id=new._id,
            role="Lt. Governor",
            organization_id=self.jurisdiction._executive._id)
        new._related.append(membership)
    else:
        new = Person(name=name, party=party, image=image)

    if primary_id in birthdays:
        new.birth_date = birthdays[primary_id]

    # various ids
    id_types = {
        'votesmart_id': 'votesmart',
        'transparencydata_id': 'influence-explorer',
        'nimsp_id': 'nimsp',
        'nimsp_candidate_id': 'nimsp-candidate',
    }
    for idname, scheme in id_types.items():
        val = old.pop(idname, None)
        if val:
            new.add_identifier(val, scheme=scheme)

    # A dedicated loop variable is used on purpose: reusing the name that
    # holds the record's own id would overwrite it with the last alias and
    # hand the wrong leg_id to process_role() further down.
    for alias_id in old.pop('all_ids'):
        new.add_identifier(alias_id, scheme='openstates')
        self._people[alias_id] = new

    # contact details
    email = old.pop('email', None)
    if email:
        new.add_contact_detail(type='email', value=email, note='')

    # office record key -> contact detail type
    office_keys = {
        'fax': 'fax',
        'phone': 'voice',
        'email': 'email',
        'address': 'address',
    }
    for office in old.pop('offices'):
        for key, detail_type in office_keys.items():
            if not office.get(key):
                continue
            if 'Office Hours' in office[key] and self.state == 'pa':
                # For 'pa', split on the office-hours marker and add each
                # non-empty piece as its own contact detail.
                for x in office[key].split('Office Hours: '):
                    if x:
                        new.add_contact_detail(type=detail_type, value=x,
                                               note=office['name'])
            else:
                new.add_contact_detail(type=detail_type, value=office[key],
                                       note=office['name'])

    # links
    link = old.pop('url', None)
    if link:
        new.add_link(link)

    # for utah, conflict of interest is in links
    if self.state == 'ut':
        for coi_url in old.pop('+links', []):
            new.add_link(note="conflict of interest form", url=coi_url)

    # sources
    for source in old.pop('sources'):
        source.pop('retrieved', None)
        source.pop('+page', None)
        new.add_source(**source)

    # roles
    for role in old.pop('roles'):
        self.process_role(new, role, leg_id=primary_id)

    for role_list in old_roles.values():
        for role in role_list:
            self.process_role(new, role, leg_id=primary_id)

    # ignore most of the names for now
    old.pop('first_name')
    old.pop('middle_name')
    old.pop('suffixes')
    old.pop('nickname', None)
    new.sort_name = old.pop('last_name')

    # some places have legacy names without underscores
    old.pop('+firstname', None)
    old.pop('+lastname', None)

    gender = old.pop('+gender', None)
    if gender:
        new.gender = gender
    biography = old.pop('+biography', None)
    if biography:
        new.biography = biography
    birth_date = old.pop('+birth_date', None)
    if birth_date:
        new.birth_date = birth_date

    # keys to keep (stored in extras without the leading '+')
    to_extras = [
        '+occupation', '+twitter', '+facebook_url', '+sworn_in_date',
        '+profession', '+secretary', '+office_hours', '+resident_county',
        '+district_name', '+leg_status', '+legal_position', '+title',
        '+start_year', '+end_date', 'occupation', '+oregon_member_id',
        '+facebook', '+youtube', '+instagram',
    ]
    for k in to_extras:
        v = old.pop(k, None)
        if v:
            new.extras[k.replace('+', '')] = v

    # keys not to keep
    to_pop = [
        '+office_fax', '+phone', '+room', '+fax', '+email', '+url',
        '+photo', '+notice', '+page', '+suffix', '+city', '+address',
        '+additional_info_url', '+contact_form', '+fax_number',
        '+phone_number', '+business_phone', '+email_address', '+img_url',
        '+office_phone', '+disctict_name', '+office_loc', '+leg_url',
        '+office', '+district_address', '+capital_address', '+bis_phone',
        '+capital_phone', '+org_info', '+role', '+other_phone',
        '+home_phone', '+zip', '+zipcode', '+county', '+capitol_phone',
        '+image_url', '+header', '+town_represented', '+full_address',
        '+capitol_address', '+website', '+district_phone',
        '+district_offices', '+party', '+district', '+capitol_office',
        '+office_address',
    ]
    for k in to_pop:
        old.pop(k, None)

    # ensure we got it all
    assert not old, old.keys()

    return new
def scrape_current_legislators(self, repos):
    """Yield posts, memberships and people for every legislator in *repos*.

    For each repo the URL from ``self.get_url`` is parsed with
    ``self.yamlize``.  For every term of every person this yields:

    * a ``Post`` the first time a (term type, division) pair is seen,
    * a chamber ``Membership`` tied to that post,
    * a party ``Membership`` when the term names a party.

    The ``Person`` itself is yielded last, and only if it has at least one
    term.
    """
    for repo in repos:
        source_url = self.get_url(repo)
        people = self.yamlize(source_url)
        # Keyed by (term type, division id): an at-large House seat and the
        # Senate seat of the same state share a division id but must not
        # share a Post.
        posts = {}
        # NOTE(review): nothing ever stores into this cache, so the lookup
        # below always yields None and a new Person is built every time.
        person_cache = defaultdict(lambda: defaultdict(lambda: None))

        for person in people:
            name = person['name'].get('official_full')
            if name is None:
                name = "{name[first]} {name[last]}".format(**person)

            # Default to '' so a missing birthday can neither raise nor
            # reuse the previous person's value.
            birth_date = person['bio'].get('birthday', '')

            who = person_cache[name][birth_date]
            has_term = False

            if who is None:
                who = Person(name=name, birth_date=birth_date)
                who.add_source(url=source_url,
                               note="unitedstates project on GitHub")

            for term in person.get('terms', []):
                has_term = True
                start_date = term['start']
                end_date = term['end']
                state = term['state']
                type_ = term['type']
                district = term.get('district', None)
                party = term.get('party', None)

                chamber = {'rep': self.house, 'sen': self.senate}[type_]
                role = {'rep': 'Representative', 'sen': 'Senator'}[type_]

                # Reset per term so a term without a usable division never
                # inherits the label/division of the previous term.
                label = division_id = None
                state_division = "ocd-division/country:us/state:{state}".format(
                    state=state.lower())

                if type_ == "rep" and district is not None:
                    label = "%s for District %s in %s" % (role, district, state)
                    division_id = state_division
                    # District 0 keeps the bare state-level division.
                    if district != 0:
                        division_id += "/cd:{district}".format(district=district)
                elif type_ == "sen":
                    label = "Senator for %s" % state
                    division_id = state_division

                if division_id is not None:
                    post_key = (type_, division_id)
                    post = posts.get(post_key)
                    if post is None:
                        post = Post(organization_id=chamber._id,
                                    division_id=division_id,
                                    label=label,
                                    role=role)
                        posts[post_key] = post
                        yield post

                    yield Membership(post_id=post._id,
                                     role=role,
                                     label=label,
                                     start_date=start_date,
                                     end_date=end_date,
                                     person_id=who._id,
                                     organization_id=chamber._id)

                if party == "Democrat":
                    party = "Democratic"

                if party:
                    yield Membership(role='member',
                                     start_date=start_date,
                                     end_date=end_date,
                                     person_id=who._id,
                                     organization_id=make_pseudo_id(
                                         classification="party",
                                         name=party))

            for key, value in person.get('id', {}).items():
                if isinstance(value, list):
                    for v in value:
                        who.add_identifier(str(v), scheme=key)
                else:
                    who.add_identifier(str(value), scheme=key)
                    if key == 'bioguide':
                        who.image = self.get_image_url(str(value))

            if has_term:
                yield who
def scrape_legislator(self, legislator_id):
    """Fetch one legislator record from the legacy API and convert it to a ``Person``.

    Each key of the record returned by ``self.api`` is removed with ``pop``
    and either translated onto the new ``Person``, copied into ``extras``,
    or intentionally thrown away.  The closing assertion fails when a key
    was not handled, so new upstream fields are noticed instead of lost.

    Every id listed in ``all_ids`` is registered in ``self._people`` as an
    alias of the new person, while roles are processed with the record's
    own ``id`` as ``leg_id``.

    Returns the populated ``Person``.
    """
    old = self.api('legislators/' + legislator_id + '?')

    # just not needed
    record_id = old.pop('id')
    for required_key in ('created_at', 'updated_at'):
        old.pop(required_key)
    old.pop('country', None)
    old.pop('level', None)
    for required_key in ('state', 'leg_id', 'active'):
        old.pop(required_key)

    # junk keys
    for junk_key in ('suffix', 'notice', 'csrfmiddlewaretoken',
                     'office_address', 'office_phone'):
        old.pop(junk_key, None)

    # translated
    district = old.pop('district', None)
    chamber = old.pop('chamber', None)
    image = old.pop('photo_url', '')
    name = old.pop('full_name')
    party = old.pop('party', None)

    if party in ('Nonpartisan', 'unknown', 'Unknown', 'Unaffiliated',
                 "Non Affiliated", " "):
        party = None
    elif party == 'Democrat':
        party = 'Democratic'

    if self.state in ('ne', 'dc'):
        chamber = 'legislature'

    if chamber == 'upper' and self.state == 'pr':
        # Upper-chamber districts for 'pr' are rewritten as Roman numerals.
        pr_district = {
            '1': 'I',
            '2': 'II',
            '3': 'III',
            '4': 'IV',
            '5': 'V',
            '6': 'VI',
            '7': 'VII',
            '8': 'VIII',
        }
        if district in pr_district:
            district = pr_district[district]

    # A stray top-level '2008-2011' key holds roles for that term; move it
    # under old_roles so it is processed like the others.
    if '2008-2011' in old:
        old['old_roles']['2008-2011'] = old.pop('2008-2011')
    old_roles = old.pop('old_roles', {})

    if old['roles'] and 'Lt. Governor' in [x['type'] for x in old['roles']]:
        new = Person(name=name, district=district, party=party, image=image)
        self.jurisdiction._executive.add_post('Lt. Governor', 'lt-gov')
        membership = Membership(
            person_id=new._id,
            role="Lt. Governor",
            organization_id=self.jurisdiction._executive._id)
        new._related.append(membership)
    else:
        new = Person(name=name, party=party, image=image)

    if record_id in birthdays:
        new.birth_date = birthdays[record_id]

    # various ids
    id_types = {
        'votesmart_id': 'votesmart',
        'transparencydata_id': 'influence-explorer',
        'nimsp_id': 'nimsp',
        'nimsp_candidate_id': 'nimsp-candidate',
    }
    for idname, scheme in id_types.items():
        val = old.pop(idname, None)
        if val:
            new.add_identifier(val, scheme=scheme)

    # The loop variable must not reuse the name holding the record's own
    # id: doing so would replace it with the last alias, and process_role()
    # below would then receive the wrong leg_id.
    for alias_id in old.pop('all_ids'):
        new.add_identifier(alias_id, scheme='openstates')
        self._people[alias_id] = new

    # contact details
    email = old.pop('email', None)
    if email:
        new.add_contact_detail(type='email', value=email, note='')

    # office record key -> contact detail type
    office_keys = {
        'fax': 'fax',
        'phone': 'voice',
        'email': 'email',
        'address': 'address',
    }
    for office in old.pop('offices'):
        for key, detail_type in office_keys.items():
            if not office.get(key):
                continue
            if 'Office Hours' in office[key] and self.state == 'pa':
                # For 'pa', split on the office-hours marker and keep every
                # non-empty piece as a separate contact detail.
                for x in office[key].split('Office Hours: '):
                    if x:
                        new.add_contact_detail(type=detail_type, value=x,
                                               note=office['name'])
            else:
                new.add_contact_detail(type=detail_type, value=office[key],
                                       note=office['name'])

    # links
    link = old.pop('url', None)
    if link:
        new.add_link(link)

    # for utah, conflict of interest is in links
    if self.state == 'ut':
        for coi_url in old.pop('+links', []):
            new.add_link(note="conflict of interest form", url=coi_url)

    # sources
    for source in old.pop('sources'):
        source.pop('retrieved', None)
        source.pop('+page', None)
        new.add_source(**source)

    # roles
    for role in old.pop('roles'):
        self.process_role(new, role, leg_id=record_id)

    for role_list in old_roles.values():
        for role in role_list:
            self.process_role(new, role, leg_id=record_id)

    # ignore most of the names for now
    old.pop('first_name')
    old.pop('middle_name')
    old.pop('suffixes')
    old.pop('nickname', None)
    new.sort_name = old.pop('last_name')

    # some places have legacy names without underscores
    old.pop('+firstname', None)
    old.pop('+lastname', None)

    gender = old.pop('+gender', None)
    if gender:
        new.gender = gender
    biography = old.pop('+biography', None)
    if biography:
        new.biography = biography
    birth_date = old.pop('+birth_date', None)
    if birth_date:
        new.birth_date = birth_date

    # keys to keep (stored in extras without the leading '+')
    to_extras = [
        '+occupation', '+twitter', '+facebook_url', '+sworn_in_date',
        '+profession', '+secretary', '+office_hours', '+resident_county',
        '+district_name', '+leg_status', '+legal_position', '+title',
        '+start_year', '+end_date', 'occupation', '+oregon_member_id',
        '+facebook', '+youtube', '+instagram',
    ]
    for k in to_extras:
        v = old.pop(k, None)
        if v:
            new.extras[k.replace('+', '')] = v

    # keys not to keep
    to_pop = [
        '+office_fax', '+phone', '+room', '+fax', '+email', '+url',
        '+photo', '+notice', '+page', '+suffix', '+city', '+address',
        '+additional_info_url', '+contact_form', '+fax_number',
        '+phone_number', '+business_phone', '+email_address', '+img_url',
        '+office_phone', '+disctict_name', '+office_loc', '+leg_url',
        '+office', '+district_address', '+capital_address', '+bis_phone',
        '+capital_phone', '+org_info', '+role', '+other_phone',
        '+home_phone', '+zip', '+zipcode', '+county', '+capitol_phone',
        '+image_url', '+header', '+town_represented', '+full_address',
        '+capitol_address', '+website', '+district_phone',
        '+district_offices', '+party', '+district', '+capitol_office',
        '+office_address',
    ]
    for k in to_pop:
        old.pop(k, None)

    # ensure we got it all
    assert not old, old.keys()

    return new
def transform_parse(self, parsed_form, response):
    """Turn one parsed LD-1 registration form into scrape objects.

    Builds a ``Disclosure``, the registrant (a ``Person`` plus a
    self-employment ``Organization`` when the registrant is a self-employed
    individual, otherwise an ``Organization``), the main contact, the
    client, foreign entities, lobbyists, affiliated organizations and a
    registration ``Event``, then yields them in this order: registrant,
    self-employment organization (if any), client, main contact, affiliated
    organizations, foreign entities, lobbyists, event, disclosure.

    Foreign entities, lobbyists and affiliated organizations are
    de-duplicated by name within the form.

    :param parsed_form: nested dict produced by the form parser.
    :param response: fetched response; only ``response.url`` is read.
    """

    def join_fields(section, prefix, fields):
        """Join the non-empty ``section[prefix + field]`` values with '; '."""
        values = [section[prefix + field] for field in fields]
        return '; '.join([v for v in values if len(v) > 0]).strip()

    def structured_address(note, section, prefix, fields):
        """Build one ``contact_details_structured`` entry, one part per field."""
        return {
            "type": "address",
            "note": note,
            "parts": [{"note": field, "value": section[prefix + field]}
                      for field in fields],
        }

    _source = {
        "url": response.url,
        "note": "LDA Form LD-1"
    }

    datetimes = parsed_form['datetimes']
    registrant_form = parsed_form['registrant']
    client_form = parsed_form['client']
    registration_form = parsed_form['registration_type']

    effective_datetime = datetime.strptime(
        datetimes['effective_date'], '%Y-%m-%d %H:%M:%S').replace(tzinfo=UTC)

    # basic disclosure fields
    _disclosure = Disclosure(
        effective_date=effective_datetime,
        timezone='America/New_York',
        submitted_date=datetime.strptime(
            datetimes['signature_date'],
            '%Y-%m-%d %H:%M:%S').replace(tzinfo=UTC),
        classification="lobbying"
    )

    _disclosure.add_authority(name=self.authority.name,
                              type=self.authority._type,
                              id=self.authority._id)

    _disclosure.add_identifier(
        identifier=parsed_form['_meta']['document_id'],
        scheme="urn:sopr:filing"
    )

    # disclosure extras
    _disclosure.extras = {}
    _disclosure.extras['registrant'] = {
        'self_employed_individual': registrant_form['self_employed_individual'],
        'general_description': registrant_form['registrant_general_description'],
        'signature': {
            "signature_date": datetimes['signature_date'],
            "signature": parsed_form['signature']
        }
    }
    _disclosure.extras['client'] = {
        'same_as_registrant': client_form['client_self'],
        'general_description': client_form['client_general_description']
    }
    _disclosure.extras['registration_type'] = {
        'is_amendment': registration_form['is_amendment'],
        'new_registrant': registration_form['new_registrant'],
        'new_client_for_existing_registrant':
            registration_form['new_client_for_existing_registrant'],
    }

    # Every membership created below starts on the disclosure's effective date.
    membership_start = _disclosure.effective_date.strftime('%Y-%m-%d')

    # # Registrant
    # build registrant
    _registrant_self_employment = None

    if registrant_form['self_employed_individual']:
        n = ' '.join([p for p in [
            registrant_form['registrant_individual_prefix'],
            registrant_form['registrant_individual_firstname'],
            registrant_form['registrant_individual_lastname']
        ] if len(p) > 0]).strip()

        _registrant = Person(name=n, source_identified=True)

        _registrant_self_employment = Organization(
            name='SELF-EMPLOYMENT of {n}'.format(n=n),
            classification='company',
            source_identified=True
        )

        _registrant.add_membership(
            organization=_registrant_self_employment,
            role='self_employed',
            label='self-employment of {n}'.format(n=n),
            start_date=membership_start
        )
    else:
        _registrant = Organization(
            name=registrant_form['registrant_org_name'],
            classification='company',
            source_identified=True
        )

    if len(registrant_form['registrant_house_id']) > 0:
        _registrant.add_identifier(
            identifier=registrant_form['registrant_house_id'],
            scheme='urn:house_clerk:registrant'
        )

    if len(registrant_form['registrant_senate_id']) > 0:
        _registrant.add_identifier(
            identifier=registrant_form['registrant_senate_id'],
            scheme='urn:sopr:registrant'
        )

    registrant_address_fields = ('address_one', 'address_two', 'city',
                                 'state', 'zip', 'country')
    registrant_ppb_fields = ('city', 'state', 'zip', 'country')

    registrant_contact_details = [
        {
            "type": "address",
            "note": "contact address",
            "value": join_fields(registrant_form, 'registrant_',
                                 registrant_address_fields),
        },
        {
            "type": "voice",
            "note": "contact phone",
            "value": registrant_form['registrant_contact_phone'],
        },
        {
            "type": "email",
            "note": "contact email",
            "value": registrant_form['registrant_contact_email'],
        },
    ]

    registrant_contact_ppb = {
        "type": "address",
        "note": "principal place of business",
        "value": join_fields(registrant_form, 'registrant_ppb_',
                             registrant_ppb_fields),
    }

    # The principal place of business is only recorded when non-empty.
    if registrant_contact_ppb["value"]:
        registrant_contact_details.append(registrant_contact_ppb)

    for cd in registrant_contact_details:
        _registrant.add_contact_detail(**cd)

    _registrant.extras = {
        "contact_details_structured": [
            structured_address("contact address", registrant_form,
                               'registrant_', registrant_address_fields),
            structured_address("principal place of business",
                               registrant_form, 'registrant_ppb_',
                               registrant_ppb_fields),
        ]
    }

    # Members (main contact, lobbyists) hang off the registrant itself when
    # it is an organization, otherwise off the self-employment organization.
    if _registrant._type == 'organization':
        _member_org = _registrant
    else:
        _member_org = _registrant_self_employment

    # # People
    # build contact
    _main_contact = Person(
        name=registrant_form['registrant_contact_name'],
        source_identified=True
    )

    main_contact_contact_details = [
        {
            "type": "voice",
            "note": "contact phone",
            "value": registrant_form['registrant_contact_phone'],
        },
        {
            "type": "email",
            "note": "contact email",
            "value": registrant_form['registrant_contact_email'],
        }
    ]

    for cd in main_contact_contact_details:
        _main_contact.add_contact_detail(**cd)

    _member_org.add_member(
        name_or_person=_main_contact,
        role='main_contact',
        label='main contact for {n}'.format(n=_registrant.name),
        start_date=membership_start
    )

    # # Client
    # build client
    _client = Organization(
        name=client_form['client_name'],
        classification='company',
        source_identified=True
    )

    client_address_fields = ('address', 'city', 'state', 'zip', 'country')
    client_ppb_fields = ('city', 'state', 'zip', 'country')

    client_contact_details = [
        {
            "type": "address",
            "note": "contact address",
            "value": join_fields(client_form, 'client_',
                                 client_address_fields),
        },
    ]

    client_contact_ppb = {
        "type": "address",
        "note": "principal place of business",
        "value": join_fields(client_form, 'client_ppb_', client_ppb_fields),
    }

    if client_contact_ppb["value"]:
        client_contact_details.append(client_contact_ppb)

    for cd in client_contact_details:
        _client.add_contact_detail(**cd)

    _client.extras = {
        "contact_details_structured": [
            structured_address("contact address", client_form,
                               'client_', client_address_fields),
            structured_address("principal place of business", client_form,
                               'client_ppb_', client_ppb_fields),
        ],
    }

    # Collect Foreign Entities
    fe_address_fields = ('address', 'city', 'state', 'country')
    fe_ppb_fields = ('state', 'country')
    _foreign_entities_by_name = {}
    for fe in parsed_form['foreign_entities']:
        fe_extras = {}
        fe_name = fe['foreign_entity_name']

        # check for name-based duplicates
        if fe_name in _foreign_entities_by_name:
            _foreign_entity = _foreign_entities_by_name[fe_name]
        else:
            _foreign_entity = Organization(
                name=fe_name,
                classification='company',
                source_identified=True
            )

        # collect contact details
        foreign_entity_contact_details = [
            {
                "type": "address",
                "note": "contact address",
                "value": join_fields(fe, 'foreign_entity_',
                                     fe_address_fields),
            },
            {
                "type": "address",
                "note": "principal place of business",
                "value": join_fields(fe, 'foreign_entity_ppb_',
                                     fe_ppb_fields),
            },
        ]

        # NOTE(review): this adds a second "principal place of business"
        # entry (with the city included) next to the state/country one
        # above -- confirm whether both are wanted.
        foreign_entity_contact_ppb = {
            "type": "address",
            "note": "principal place of business",
            "value": '; '.join([
                p for p in [
                    fe['foreign_entity_ppb_city'],
                    fe['foreign_entity_ppb_state'],
                    fe['foreign_entity_ppb_country']]
                if len(p) > 0]),
        }

        if foreign_entity_contact_ppb["value"]:
            foreign_entity_contact_details.append(
                foreign_entity_contact_ppb)

        # add contact details, skipping empty values
        for cd in foreign_entity_contact_details:
            if cd['value'] != '':
                _foreign_entity.add_contact_detail(**cd)

        # add extras
        fe_extras["contact_details_structured"] = [
            structured_address("contact address", fe,
                               'foreign_entity_', fe_address_fields),
            structured_address("principal place of business", fe,
                               'foreign_entity_ppb_', fe_ppb_fields),
        ]

        _foreign_entity.extras = combine_dicts(_foreign_entity.extras,
                                               fe_extras)

        _foreign_entities_by_name[fe_name] = _foreign_entity

    _foreign_entities = list(_foreign_entities_by_name.values())

    # TODO: add a variant on memberships to represent inter-org
    # relationships (associations, ownership, etc)
    #
    # _client['memberships'].append({
    #     "id": _foreign_entity['id'],
    #     "classification": "organization",
    #     "name": _foreign_entity['name'],
    #     "extras": {
    #         "ownership_percentage":
    #             fe['foreign_entity_amount']
    #     }
    # })

    # Collect Lobbyists
    # TODO: deal with weird non-name line continuation cases (blanks, "continued")
    _lobbyists_by_name = {}
    for l in parsed_form['lobbyists']:
        l_extras = {}
        l_name = ' '.join([l['lobbyist_first_name'],
                           l['lobbyist_last_name'],
                           l['lobbyist_suffix']
                           ]).strip()

        if l_name in _lobbyists_by_name:
            _lobbyist = _lobbyists_by_name[l_name]
        else:
            _lobbyist = Person(
                name=l_name,
                source_identified=True
            )

        if l['lobbyist_covered_official_position']:
            l_extras['lda_covered_official_positions'] = [
                {
                    'date_reported': datetimes['effective_date'],
                    'covered_official_position':
                        l['lobbyist_covered_official_position']
                },
            ]

        _lobbyist.extras = combine_dicts(_lobbyist.extras, l_extras)

        _lobbyists_by_name[l_name] = _lobbyist

    _lobbyists = list(_lobbyists_by_name.values())

    for l in _lobbyists:
        _member_org.add_member(
            l,
            role='lobbyist',
            label='lobbyist for {n}'.format(n=_registrant.name),
            start_date=membership_start
        )

    # # Document
    # build document
    _disclosure.add_document(
        note='submitted filing',
        date=datetimes['effective_date'][:10],
        url=response.url
    )

    # Collect Affiliated orgs
    ao_address_fields = ('address', 'city', 'state', 'zip', 'country')
    ao_ppb_fields = ('city', 'state', 'country')
    _affiliated_organizations_by_name = {}
    for ao in parsed_form['affiliated_organizations']:
        ao_extras = {}
        ao_name = ao['affiliated_organization_name']

        if ao_name in _affiliated_organizations_by_name:
            # There's already one by this name
            _affiliated_organization = _affiliated_organizations_by_name[ao_name]
        else:
            # New affiliated org
            _affiliated_organization = Organization(
                name=ao_name,
                classification='company',
                source_identified=True
            )

        # collect contact details
        affiliated_organization_contact_details = [
            {
                "type": "address",
                "note": "contact address",
                "value": join_fields(ao, 'affiliated_organization_',
                                     ao_address_fields),
            },
        ]

        affiliated_organization_contact_ppb = {
            "type": "address",
            "note": "principal place of business",
            "value": join_fields(ao, 'affiliated_organization_ppb_',
                                 ao_ppb_fields),
        }

        if affiliated_organization_contact_ppb["value"]:
            affiliated_organization_contact_details.append(
                affiliated_organization_contact_ppb)

        # add contact details
        for cd in affiliated_organization_contact_details:
            _affiliated_organization.add_contact_detail(**cd)

        # This must be a plain list: a trailing comma after the closing
        # bracket would silently wrap it in a one-element tuple.
        ao_extras["contact_details_structured"] = [
            structured_address("contact address", ao,
                               'affiliated_organization_',
                               ao_address_fields),
            structured_address("principal place of business", ao,
                               'affiliated_organization_ppb_',
                               ao_ppb_fields),
        ]

        _affiliated_organization.extras = combine_dicts(
            _affiliated_organization.extras, ao_extras)

        # Register the organization; without this the by-name map stays
        # empty and no affiliated organization is ever yielded or added to
        # the event.
        _affiliated_organizations_by_name[ao_name] = _affiliated_organization

    _affiliated_organizations = list(
        _affiliated_organizations_by_name.values())

    # # Events & Agendas
    # name
    if registration_form['new_registrant']:
        registration_type = 'New Client, New Registrant'
    elif registration_form['is_amendment']:
        registration_type = 'Amended Registration'
    else:
        registration_type = 'New Client for Existing Registrant'

    # Create registration event
    _event = Event(
        name="{rn} - {rt}, {cn}".format(rn=_registrant.name,
                                        rt=registration_type,
                                        cn=_client.name),
        timezone='America/New_York',
        location='United States',
        start_time=effective_datetime,
        classification='registration'
    )

    # add participants
    _event.add_participant(type=_registrant._type,
                           id=_registrant._id,
                           name=_registrant.name,
                           note="registrant")

    # NOTE(review): for a person registrant this repeats the call above
    # with identical arguments -- confirm whether a different participant
    # (e.g. the self-employment organization) was intended.
    if _registrant._type == 'person':
        _event.add_participant(type=_registrant._type,
                               id=_registrant._id,
                               name=_registrant.name,
                               note="registrant")

    _event.add_participant(type=_client._type,
                           id=_client._id,
                           name=_client.name,
                           note="client")

    for l in _lobbyists:
        _event.add_participant(type=l._type,
                               id=l._id,
                               name=l.name,
                               note='lobbyist')

    for fe in _foreign_entities:
        _event.add_participant(type=fe._type,
                               id=fe._id,
                               name=fe.name,
                               note='foreign_entity')

    for ao in _affiliated_organizations:
        _event.add_participant(type=ao._type,
                               id=ao._id,
                               name=ao.name,
                               note='affiliated_organization')

    # add agenda item
    _agenda = _event.add_agenda_item(
        description='issues lobbied on',
    )

    _agenda['notes'].append(
        parsed_form['lobbying_issues_detail']
    )

    for li in parsed_form['lobbying_issues']:
        if li['general_issue_area'] != '':
            _agenda.add_subject(li['general_issue_area'])

    _disclosure.add_disclosed_event(
        name=_event.name,
        type=_event._type,
        classification=_event.classification,
        id=_event._id
    )

    # add registrant to disclosure's _related and related_entities fields
    _disclosure.add_registrant(name=_registrant.name,
                               type=_registrant._type,
                               id=_registrant._id)

    # Attach the form URL as a source to every object, then emit it.
    _registrant.add_source(url=_source['url'], note='registrant')
    yield _registrant

    if _registrant_self_employment is not None:
        _registrant_self_employment.add_source(
            url=_source['url'],
            note='registrant_self_employment'
        )
        yield _registrant_self_employment

    _client.add_source(url=_source['url'], note='client')
    yield _client

    _main_contact.add_source(url=_source['url'], note='main_contact')
    yield _main_contact

    for ao in _affiliated_organizations:
        ao.add_source(url=_source['url'], note='affiliated_organization')
        yield ao

    for fe in _foreign_entities:
        fe.add_source(url=_source['url'], note='foreign_entity')
        yield fe

    for l in _lobbyists:
        l.add_source(url=_source['url'], note='lobbyist')
        yield l

    _event.add_source(**_source)
    yield _event

    _disclosure.add_source(**_source)
    yield _disclosure
def scrape_current_legislators(self, repos):
    """Yield posts, memberships and people parsed from legislator YAML files.

    Each entry of ``repos`` is resolved with ``self.get_url`` and the result
    is parsed with ``self.yamlize``.  For every person record this yields,
    in order: any ``Post`` not yet yielded for the current file, one chamber
    ``Membership`` per term, one party ``Membership`` per term that names a
    party, and finally the ``Person`` itself (only when the record has at
    least one term).

    Args:
        repos: iterable of values handed one at a time to ``self.get_url``.

    Yields:
        ``Post``, ``Membership`` and ``Person`` objects.

    Raises:
        KeyError: if a term's ``type`` is neither ``'rep'`` nor ``'sen'``,
            or a record/term lacks one of the keys read unconditionally
            (``name``, ``bio``, ``start``, ``end``, ``state``, ``type``).
    """
    for repo in repos:
        legislators_url = self.get_url(repo)
        people = self.yamlize(legislators_url)

        # Posts already yielded for this file, keyed by (term type,
        # division id).  The term type is part of the key because an
        # at-large House seat (district 0) and the state's Senate seat
        # share the same state-level division id; keyed by division alone,
        # one chamber would reuse the other chamber's post.
        posts = {}

        # NOTE(review): nothing ever stores a Person in this cache, so the
        # lookup below always returns None and every record gets a fresh
        # Person.  Left untouched: populating it would make the same
        # Person be yielded more than once.
        person_cache = defaultdict(lambda: defaultdict(lambda: None))

        for person in people:
            name = person['name'].get('official_full')
            if name is None:
                name = "{name[first]} {name[last]}".format(**person)

            # Resolve the birthday per record so that a missing one can
            # neither raise nor inherit the previous person's date.
            birth_date = person['bio'].get('birthday')

            who = person_cache[name][birth_date]
            has_term = False

            if who is None:
                # Only pass birth_date when the record supplies one.
                # Assumes Person's birth_date argument is optional --
                # TODO confirm against the Person constructor.
                person_kwargs = {}
                if birth_date is not None:
                    person_kwargs['birth_date'] = birth_date
                who = Person(name=name, **person_kwargs)
                who.add_source(url=legislators_url,
                               note="unitedstates project on GitHub")

            for term in person.get('terms', []):
                has_term = True
                start_date = term['start']
                end_date = term['end']
                state = term['state']
                type_ = term['type']
                district = term.get('district', None)
                party = term.get('party', None)

                # Raises KeyError for any term type other than rep/sen.
                role = {
                    'rep': 'Representative',
                    'sen': 'Senator',
                }[type_]

                # Work out which seat (if any) this term occupies.  A
                # 'rep' term without a district gets no post or chamber
                # membership, only the party membership further down.
                division_id = None
                if type_ == "rep" and district is not None:
                    label = "%s for District %s in %s" % (
                        role, district, state)
                    if district == 0:
                        # District 0 is mapped to the whole state.
                        division_id = (
                            "ocd-division/country:us/state:{state}".format(
                                state=state.lower()))
                    else:
                        division_id = (
                            "ocd-division/country:us/"
                            "state:{state}/cd:{district}".format(
                                state=state.lower(), district=district))
                elif type_ == "sen":
                    # NOTE(review): this label used to be spelled
                    # "Senitor for ..."; posts already imported under the
                    # old spelling may need relabeling -- confirm.
                    label = "Senator for %s" % (state)
                    division_id = (
                        "ocd-division/country:us/state:{state}".format(
                            state=state.lower()))

                if division_id is not None:
                    chamber_org = {
                        "rep": self.house,
                        "sen": self.senate,
                    }[type_]

                    post_key = (type_, division_id)
                    post = posts.get(post_key)
                    if post is None:
                        post = Post(organization_id=chamber_org._id,
                                    division_id=division_id,
                                    label=label,
                                    role=role)
                        posts[post_key] = post
                        yield post

                    yield Membership(post_id=post._id,
                                     role=role,
                                     label=label,
                                     start_date=start_date,
                                     end_date=end_date,
                                     person_id=who._id,
                                     organization_id=chamber_org._id)

                # The party organization is looked up under "Democratic".
                if party == "Democrat":
                    party = "Democratic"

                if party:
                    yield Membership(role='member',
                                     start_date=start_date,
                                     end_date=end_date,
                                     person_id=who._id,
                                     organization_id=make_pseudo_id(
                                         classification="party",
                                         name=party))

            # An id entry may hold one value or a list of values; each
            # value becomes its own identifier under the entry's key.
            for scheme, value in person.get('id', {}).items():
                values = value if isinstance(value, list) else [value]
                for v in values:
                    who.add_identifier(str(v), scheme=scheme)

            # Records without any term are dropped.
            if has_term:
                yield who