def test_same_name_people():
    """Same-named people can only be imported once birth dates separate them."""
    org = Organization.objects.create(name='WWE', jurisdiction_id='jurisdiction-id')

    def run_import(*people):
        # every step uses a brand-new importer, mirroring separate import runs
        return PersonImporter('jurisdiction-id').import_data(
            [person.as_dict() for person in people])

    first = ScrapePerson('Dwayne Johnson', image='http://example.com/1')
    second = ScrapePerson('Dwayne Johnson', image='http://example.com/2')

    # same name, different images, nothing else to tell them apart: rejected
    with pytest.raises(SameNameError):
        run_import(first, second)

    # distinct birth dates make the two records importable
    first.birth_date = '1970'
    second.birth_date = '1930'
    counts = run_import(first, second)['person']
    assert counts['insert'] == 2
    assert counts['noop'] == 0
    assert counts['update'] == 0
    assert Person.objects.count() == 2

    # give everyone a membership so the next import can look them up
    for person in Person.objects.all():
        Membership.objects.create(person=person, organization=org)

    # a changed image is an update; a changed birth date is a new person
    first.image = 'http://example.com/1.jpg'
    second.birth_date = '1931'
    counts = run_import(first, second)['person']
    assert Person.objects.count() == 3
    assert counts['insert'] == 1
    assert counts['noop'] == 0
    assert counts['update'] == 1
def test_same_name_people():
    """Check insert/update counts when two scraped people share one name."""
    jid = 'jurisdiction-id'
    wwe = Organization.objects.create(name='WWE', jurisdiction_id=jid)
    person_a = ScrapePerson('Dwayne Johnson', image='http://example.com/1')
    person_b = ScrapePerson('Dwayne Johnson', image='http://example.com/2')

    # without birth dates the importer cannot tell the two apart
    with pytest.raises(SameNameError):
        PersonImporter(jid).import_data([person_a.as_dict(), person_b.as_dict()])

    # birth dates disambiguate them, so both are inserted
    person_a.birth_date = '1970'
    person_b.birth_date = '1930'
    report = PersonImporter(jid).import_data(
        [person_a.as_dict(), person_b.as_dict()])
    stats = report['person']
    assert (stats['insert'], stats['noop'], stats['update']) == (2, 0, 0)
    assert Person.objects.count() == 2

    # attach memberships so later lookups can find the imported people
    for imported in Person.objects.all():
        Membership.objects.create(person=imported, organization=wwe)

    # new image on A -> update; new birth date on B -> an additional insert
    person_a.image = 'http://example.com/1.jpg'
    person_b.birth_date = '1931'
    report = PersonImporter(jid).import_data(
        [person_a.as_dict(), person_b.as_dict()])
    stats = report['person']
    assert Person.objects.count() == 3
    assert (stats['insert'], stats['noop'], stats['update']) == (1, 0, 1)
def test_same_name_people():
    """Walk through the same-name import rules step by step."""
    org = Organization.objects.create(name='WWE', jurisdiction_id='jurisdiction-id')

    def run_import(*people):
        # a fresh importer per step, exactly one import_data call each
        return PersonImporter('jurisdiction-id').import_data(
            [person.as_dict() for person in people])

    def add_memberships():
        # fake memberships so that later lookups work on the imported people
        for person in Person.objects.all():
            Membership.objects.create(person=person, organization=org)

    first = ScrapePerson('Dwayne Johnson', image='http://example.com/1')
    second = ScrapePerson('Dwayne Johnson', image='http://example.com/2')

    # two same-named people into an empty database: error
    with pytest.raises(SameNameError):
        run_import(first, second)

    # a single person on their own: accepted
    run_import(first)
    add_memberships()

    # bringing in the second same-named person still fails
    with pytest.raises(SameNameError):
        run_import(first, second)

    # with birth dates, the existing person is updated and the other inserted
    first.birth_date = '1970'
    second.birth_date = '1930'
    counts = run_import(first, second)['person']
    assert counts['insert'] == 1
    assert counts['noop'] == 0
    assert counts['update'] == 1
    assert Person.objects.count() == 2
    add_memberships()

    # a third same-named person lacking a birth date is rejected
    third = ScrapePerson('Dwayne Johnson', image='http://example.com/3')
    with pytest.raises(SameNameError):
        run_import(third)

    # an image change updates; a birth date change inserts a new person
    first.image = 'http://example.com/1.jpg'
    second.birth_date = '1931'
    counts = run_import(first, second)['person']
    assert Person.objects.count() == 3
    assert counts['insert'] == 1
    assert counts['noop'] == 0
    assert counts['update'] == 1
def test_same_name_people():
    """Exercise the error, insert and update paths for people sharing a name."""
    jid = 'jurisdiction-id'
    wwe = Organization.objects.create(name='WWE', jurisdiction_id=jid)

    person_a = ScrapePerson('Dwayne Johnson', image='http://example.com/1')
    person_b = ScrapePerson('Dwayne Johnson', image='http://example.com/2')

    # pristine database, two identical names: must raise
    with pytest.raises(SameNameError):
        PersonImporter(jid).import_data([person_a.as_dict(), person_b.as_dict()])

    # one person alone goes through
    PersonImporter(jid).import_data([person_a.as_dict()])

    # memberships make the imported people discoverable by later imports
    for imported in Person.objects.all():
        Membership.objects.create(person=imported, organization=wwe)

    # the pair is still ambiguous, so importing both raises again
    with pytest.raises(SameNameError):
        PersonImporter(jid).import_data([person_a.as_dict(), person_b.as_dict()])

    # birth dates resolve the ambiguity: A is updated, B is inserted
    person_a.birth_date = '1970'
    person_b.birth_date = '1930'
    report = PersonImporter(jid).import_data(
        [person_a.as_dict(), person_b.as_dict()])
    stats = report['person']
    assert (stats['insert'], stats['noop'], stats['update']) == (1, 0, 1)
    assert Person.objects.count() == 2

    # memberships again, now covering the newly inserted person as well
    for imported in Person.objects.all():
        Membership.objects.create(person=imported, organization=wwe)

    # a third namesake with no birth date cannot be placed
    person_c = ScrapePerson('Dwayne Johnson', image='http://example.com/3')
    with pytest.raises(SameNameError):
        PersonImporter(jid).import_data([person_c.as_dict()])

    # finally: image change -> update, birth date change -> new insert
    person_a.image = 'http://example.com/1.jpg'
    person_b.birth_date = '1931'
    report = PersonImporter(jid).import_data(
        [person_a.as_dict(), person_b.as_dict()])
    stats = report['person']
    assert Person.objects.count() == 3
    assert (stats['insert'], stats['noop'], stats['update']) == (1, 0, 1)
def test_same_name_second_import():
    """A later same-named person without a birth date must be rejected.

    Two people sharing a name are first imported with distinct birth dates,
    then a third person with that name and no birth date is imported on its
    own; that second import is expected to raise ``SameNameError``.
    """
    o = Organization.objects.create(name='WWE', jurisdiction_id='jurisdiction-id')

    p1 = ScrapePerson('Dwayne Johnson', image='http://example.com/1')
    p2 = ScrapePerson('Dwayne Johnson', image='http://example.com/2')
    p1.birth_date = '1970'
    p2.birth_date = '1930'

    # with birth dates set, the first import of both people goes through
    # (the result is not inspected here, so it is not bound to a name)
    PersonImporter('jurisdiction-id').import_data([p1.as_dict(), p2.as_dict()])

    # fake some memberships so future lookups work on these people
    for p in Person.objects.all():
        Membership.objects.create(person=p, organization=o)

    # a third person with the same name and no birth date is ambiguous
    p3 = ScrapePerson('Dwayne Johnson', image='http://example.com/3')
    with pytest.raises(SameNameError):
        PersonImporter('jurisdiction-id').import_data([p3.as_dict()])
def test_same_name_second_import():
    """A namesake with no birth date is refused on a follow-up import."""
    create_jurisdiction()
    jid = 'jid'
    wwe = Organization.objects.create(name='WWE', jurisdiction_id=jid)

    # two people with one name, told apart only by their birth dates
    dated = [
        ScrapePerson('Dwayne Johnson', image='http://example.com/1'),
        ScrapePerson('Dwayne Johnson', image='http://example.com/2'),
    ]
    dated[0].birth_date = '1970'
    dated[1].birth_date = '1930'

    # the initial import of the dated pair goes through
    PersonImporter(jid).import_data([person.as_dict() for person in dated])

    # memberships let the next import look these people up
    for imported in Person.objects.all():
        Membership.objects.create(person=imported, organization=wwe)

    # an undated third namesake must raise
    undated = ScrapePerson('Dwayne Johnson', image='http://example.com/3')
    with pytest.raises(SameNameError):
        PersonImporter(jid).import_data([undated.as_dict()])
def scrape_legislator(self, legislator_id):
    """Convert one legacy API legislator record into a ``Person``.

    The record is fetched through ``self.api`` and consumed key by key with
    ``pop``. The closing assertion fails if any key is left over, so an
    unexpected upstream field is noticed instead of being silently dropped.

    :param legislator_id: id appended to the ``legislators/`` API path.
    :returns: the populated ``Person``. It is also stored in
        ``self._people`` under every id listed in the record's ``all_ids``.
    :raises KeyError: if a required key is missing from the record.
    :raises AssertionError: if unhandled keys remain in the record.
    """
    old = self.api('legislators/' + legislator_id + '?')

    # just not needed
    leg_id = old.pop('id')
    old.pop('created_at')
    old.pop('updated_at')
    old.pop('country', None)
    old.pop('level', None)
    old.pop('state')
    old.pop('leg_id')
    old.pop('active')

    # junk keys
    old.pop('suffix', None)
    old.pop('notice', None)
    old.pop('csrfmiddlewaretoken', None)
    old.pop('office_address', None)
    old.pop('office_phone', None)

    # translated
    district = old.pop('district', None)
    chamber = old.pop('chamber', None)
    image = old.pop('photo_url', '')
    name = old.pop('full_name')
    party = old.pop('party', None)

    if party in ('Nonpartisan', 'unknown', 'Unknown', 'Unaffiliated',
                 "Non Affiliated", " "):
        party = None
    elif party == 'Democrat':
        party = 'Democratic'

    if self.state in ('ne', 'dc'):
        chamber = 'legislature'

    if chamber == 'upper' and self.state == 'pr':
        # upper-chamber districts in 'pr' are rewritten as Roman numerals
        pr_district = {
            '1': 'I',
            '2': 'II',
            '3': 'III',
            '4': 'IV',
            '5': 'V',
            '6': 'VI',
            '7': 'VII',
            '8': 'VIII',
        }
        if district in pr_district:
            district = pr_district[district]

    if '2008-2011' in old:
        # 'old_roles' is treated as optional just below, so create it on
        # demand here rather than raising KeyError when it is absent
        old.setdefault('old_roles', {})['2008-2011'] = old.pop('2008-2011')

    old_roles = old.pop('old_roles', {})

    if old['roles'] and 'Lt. Governor' in [x['type'] for x in old['roles']]:
        new = Person(name=name, district=district, party=party, image=image)
        self.jurisdiction._executive.add_post('Lt. Governor', 'lt-gov')
        membership = Membership(
            person_id=new._id,
            role="Lt. Governor",
            organization_id=self.jurisdiction._executive._id,
        )
        new._related.append(membership)
    else:
        new = Person(name=name, party=party, image=image)

    if leg_id in birthdays:
        new.birth_date = birthdays[leg_id]

    # various ids
    id_types = {
        'votesmart_id': 'votesmart',
        'transparencydata_id': 'influence-explorer',
        'nimsp_id': 'nimsp',
        'nimsp_candidate_id': 'nimsp-candidate',
    }
    for idname, scheme in id_types.items():
        val = old.pop(idname, None)
        if val:
            new.add_identifier(val, scheme=scheme)

    # The loop variable is deliberately NOT the record's own id: reusing that
    # name would leave it bound to the last alias, and the role processing
    # below would then be called with that alias instead.
    for alias_id in old.pop('all_ids'):
        new.add_identifier(alias_id, scheme='openstates')
        self._people[alias_id] = new

    # contact details
    email = old.pop('email', None)
    if email:
        new.add_contact_detail(type='email', value=email, note='')

    office_keys = {
        'fax': 'fax',
        'phone': 'voice',
        'email': 'email',
        'address': 'address',
    }
    for office in old.pop('offices'):
        for key, detail_type in office_keys.items():
            if not office.get(key):
                continue
            if 'Office Hours' in office[key] and self.state == 'pa':
                # 'pa' values can embed office hours; split on the marker
                # and add each non-empty piece as its own contact detail
                for piece in office[key].split('Office Hours: '):
                    if piece:
                        new.add_contact_detail(type=detail_type, value=piece,
                                               note=office['name'])
            else:
                new.add_contact_detail(type=detail_type, value=office[key],
                                       note=office['name'])

    # links
    link = old.pop('url', None)
    if link:
        new.add_link(link)

    # for utah, conflict of interest is in links
    if self.state == 'ut':
        for coi_url in old.pop('+links', []):
            new.add_link(note="conflict of interest form", url=coi_url)

    # sources
    for source in old.pop('sources'):
        source.pop('retrieved', None)
        source.pop('+page', None)
        new.add_source(**source)

    # roles
    for role in old.pop('roles'):
        self.process_role(new, role, leg_id=leg_id)

    for role_list in old_roles.values():
        for role in role_list:
            self.process_role(new, role, leg_id=leg_id)

    # ignore most of the names for now
    old.pop('first_name')
    old.pop('middle_name')
    old.pop('suffixes')
    old.pop('nickname', None)
    new.sort_name = old.pop('last_name')

    # some places have legacy names without underscores
    old.pop('+firstname', None)
    old.pop('+lastname', None)

    gender = old.pop('+gender', None)
    if gender:
        new.gender = gender
    biography = old.pop('+biography', None)
    if biography:
        new.biography = biography
    birth_date = old.pop('+birth_date', None)
    if birth_date:
        new.birth_date = birth_date

    # keys to keep
    to_extras = [
        '+occupation', '+twitter', '+facebook_url', '+sworn_in_date',
        '+profession', '+secretary', '+office_hours', '+resident_county',
        '+district_name', '+leg_status', '+legal_position', '+title',
        '+start_year', '+end_date', 'occupation', '+oregon_member_id',
        '+facebook', '+youtube', '+instagram',
    ]
    for k in to_extras:
        v = old.pop(k, None)
        if v:
            new.extras[k.replace('+', '')] = v

    # keys not to keep
    to_pop = [
        '+office_fax', '+phone', '+room', '+fax', '+email', '+url',
        '+photo', '+notice', '+page', '+suffix', '+city', '+address',
        '+additional_info_url', '+contact_form', '+fax_number',
        '+phone_number', '+business_phone', '+email_address', '+img_url',
        '+office_phone', '+disctict_name', '+office_loc', '+leg_url',
        '+office', '+district_address', '+capital_address', '+bis_phone',
        '+capital_phone', '+org_info', '+role', '+other_phone',
        '+home_phone', '+zip', '+zipcode', '+county', '+capitol_phone',
        '+image_url', '+header', '+town_represented', '+full_address',
        '+capitol_address', '+website', '+district_phone',
        '+district_offices', '+party', '+district', '+capitol_office',
        '+office_address',
    ]
    for k in to_pop:
        old.pop(k, None)

    # ensure we got it all
    assert not old, old.keys()

    return new
def scrape_legislator(self, legislator_id):
    """Convert one legacy API legislator record into a ``Person``.

    The record is fetched through ``self.api`` and consumed key by key with
    ``pop``. The closing assertion fails if any key is left over, so an
    unexpected upstream field is noticed instead of being silently dropped.

    :param legislator_id: id appended to the ``legislators/`` API path.
    :returns: the populated ``Person``. It is also stored in
        ``self._people`` under every id listed in the record's ``all_ids``.
    :raises KeyError: if a required key is missing from the record.
    :raises AssertionError: if unhandled keys remain in the record.
    """
    old = self.api('legislators/' + legislator_id + '?')

    # just not needed
    leg_id = old.pop('id')
    old.pop('created_at')
    old.pop('updated_at')
    old.pop('country', None)
    old.pop('level', None)
    old.pop('state')
    old.pop('leg_id')
    old.pop('active')

    # junk keys
    old.pop('suffix', None)
    old.pop('notice', None)
    old.pop('csrfmiddlewaretoken', None)
    old.pop('office_address', None)
    old.pop('office_phone', None)

    # translated
    district = old.pop('district', None)
    chamber = old.pop('chamber', None)
    image = old.pop('photo_url', '')
    name = old.pop('full_name')
    party = old.pop('party', None)

    if party in ('Nonpartisan', 'unknown', 'Unknown', 'Unaffiliated',
                 "Non Affiliated", " "):
        party = None
    elif party == 'Democrat':
        party = 'Democratic'

    if self.state in ('ne', 'dc'):
        chamber = 'legislature'

    if chamber == 'upper' and self.state == 'pr':
        # upper-chamber districts in 'pr' are rewritten as Roman numerals
        pr_district = {
            '1': 'I',
            '2': 'II',
            '3': 'III',
            '4': 'IV',
            '5': 'V',
            '6': 'VI',
            '7': 'VII',
            '8': 'VIII',
        }
        if district in pr_district:
            district = pr_district[district]

    if '2008-2011' in old:
        # 'old_roles' is treated as optional just below, so create it on
        # demand here rather than raising KeyError when it is absent
        old.setdefault('old_roles', {})['2008-2011'] = old.pop('2008-2011')

    old_roles = old.pop('old_roles', {})

    if old['roles'] and 'Lt. Governor' in [x['type'] for x in old['roles']]:
        new = Person(name=name, district=district, party=party, image=image)
        self.jurisdiction._executive.add_post('Lt. Governor', 'lt-gov')
        membership = Membership(
            person_id=new._id,
            role="Lt. Governor",
            organization_id=self.jurisdiction._executive._id,
        )
        new._related.append(membership)
    else:
        new = Person(name=name, party=party, image=image)

    if leg_id in birthdays:
        new.birth_date = birthdays[leg_id]

    # various ids
    id_types = {
        'votesmart_id': 'votesmart',
        'transparencydata_id': 'influence-explorer',
        'nimsp_id': 'nimsp',
        'nimsp_candidate_id': 'nimsp-candidate',
    }
    for idname, scheme in id_types.items():
        val = old.pop(idname, None)
        if val:
            new.add_identifier(val, scheme=scheme)

    # The loop variable is deliberately NOT the record's own id: reusing that
    # name would leave it bound to the last alias, and the role processing
    # below would then be called with that alias instead.
    for alias_id in old.pop('all_ids'):
        new.add_identifier(alias_id, scheme='openstates')
        self._people[alias_id] = new

    # contact details
    email = old.pop('email', None)
    if email:
        new.add_contact_detail(type='email', value=email, note='')

    office_keys = {
        'fax': 'fax',
        'phone': 'voice',
        'email': 'email',
        'address': 'address',
    }
    for office in old.pop('offices'):
        for key, detail_type in office_keys.items():
            if not office.get(key):
                continue
            if 'Office Hours' in office[key] and self.state == 'pa':
                # 'pa' values can embed office hours; split on the marker
                # and add each non-empty piece as its own contact detail
                for piece in office[key].split('Office Hours: '):
                    if piece:
                        new.add_contact_detail(type=detail_type, value=piece,
                                               note=office['name'])
            else:
                new.add_contact_detail(type=detail_type, value=office[key],
                                       note=office['name'])

    # links
    link = old.pop('url', None)
    if link:
        new.add_link(link)

    # for utah, conflict of interest is in links
    if self.state == 'ut':
        for coi_url in old.pop('+links', []):
            new.add_link(note="conflict of interest form", url=coi_url)

    # sources
    for source in old.pop('sources'):
        source.pop('retrieved', None)
        source.pop('+page', None)
        new.add_source(**source)

    # roles
    for role in old.pop('roles'):
        self.process_role(new, role, leg_id=leg_id)

    for role_list in old_roles.values():
        for role in role_list:
            self.process_role(new, role, leg_id=leg_id)

    # ignore most of the names for now
    old.pop('first_name')
    old.pop('middle_name')
    old.pop('suffixes')
    old.pop('nickname', None)
    new.sort_name = old.pop('last_name')

    # some places have legacy names without underscores
    old.pop('+firstname', None)
    old.pop('+lastname', None)

    gender = old.pop('+gender', None)
    if gender:
        new.gender = gender
    biography = old.pop('+biography', None)
    if biography:
        new.biography = biography
    birth_date = old.pop('+birth_date', None)
    if birth_date:
        new.birth_date = birth_date

    # keys to keep
    to_extras = [
        '+occupation', '+twitter', '+facebook_url', '+sworn_in_date',
        '+profession', '+secretary', '+office_hours', '+resident_county',
        '+district_name', '+leg_status', '+legal_position', '+title',
        '+start_year', '+end_date', 'occupation', '+oregon_member_id',
        '+facebook', '+youtube', '+instagram',
    ]
    for k in to_extras:
        v = old.pop(k, None)
        if v:
            new.extras[k.replace('+', '')] = v

    # keys not to keep
    to_pop = [
        '+office_fax', '+phone', '+room', '+fax', '+email', '+url',
        '+photo', '+notice', '+page', '+suffix', '+city', '+address',
        '+additional_info_url', '+contact_form', '+fax_number',
        '+phone_number', '+business_phone', '+email_address', '+img_url',
        '+office_phone', '+disctict_name', '+office_loc', '+leg_url',
        '+office', '+district_address', '+capital_address', '+bis_phone',
        '+capital_phone', '+org_info', '+role', '+other_phone',
        '+home_phone', '+zip', '+zipcode', '+county', '+capitol_phone',
        '+image_url', '+header', '+town_represented', '+full_address',
        '+capitol_address', '+website', '+district_phone',
        '+district_offices', '+party', '+district', '+capitol_office',
        '+office_address',
    ]
    for k in to_pop:
        old.pop(k, None)

    # ensure we got it all
    assert not old, old.keys()

    return new
def legislators(self, latest_only):
    """Build ``Person`` objects for the rows yielded by ``self._memberships``.

    Each row is reduced to a name, district and party. Rows with an empty
    party cell or a name ending in ``*`` are skipped. A name seen more than
    once reuses its ``Person`` and gains another term tuple. For every row
    kept, the legislator's detail page is fetched for photo and contact data.

    :param latest_only: passed through unchanged to ``self._memberships``.
    :returns: dict mapping legislator name to a ``(Person, terms)`` pair,
        where ``terms`` is a list of ``(chamber, district, term, party)``
        tuples.
    """
    # loop-invariant lookups, built once instead of once per row
    party_names = {'D': 'Democratic', 'R': 'Republican', 'I': 'Independent'}
    office_xpaths = {
        'capitol': '//table[contains(string(), "Springfield Office")]',
        'district': '//table[contains(string(), "District Office")]',
    }
    hotgarbage = ('Senate Biography Information for the 98th General '
                  'Assembly is not currently available.')

    legs = {}
    for member, chamber, term, url in self._memberships(latest_only):
        name, _, _, district, party = member.xpath('td')
        district = district.text
        detail_url = name.xpath('a/@href')[0]

        if party.text_content().strip() == "":
            self.warning("Garbage party: Skipping!")
            continue

        party = party_names[party.text]
        name = name.text_content().strip()

        # inactive legislator, skip them for now
        if name.endswith('*'):
            continue

        name = AKA.get(name, name)

        if name in legs:
            p, terms = legs[name]
            terms.append((chamber, district, term, party))
        else:
            p = Person(name, party=party)
            legs[name] = p, [(chamber, district, term, party)]

        p.add_source(url)
        p.add_source(detail_url)
        p.add_link(detail_url)

        birth_date = BIRTH_DATES.get(name, None)
        if birth_date:
            p.birth_date = birth_date

        leg_html = self.get(detail_url).text
        leg_doc = lxml.html.fromstring(leg_html)
        leg_doc.make_links_absolute(detail_url)

        if hotgarbage in leg_html:
            # The legislator's bio isn't available yet; the person stays in
            # ``legs`` but gets no photo or contact details from this page.
            self.logger.warning('No legislator bio available for ' + name)
            continue

        photo_url = leg_doc.xpath(
            '//img[contains(@src, "/members/")]/@src')[0]
        p.image = photo_url

        # start from a clean slate so repeated rows don't pile up details
        p.contact_details = []

        # email
        email = leg_doc.xpath('//b[text()="Email: "]')
        if email:
            p.add_contact_detail(type='email',
                                 value=email[0].tail.strip(),
                                 note='capitol')

        for location, xpath in office_xpaths.items():
            table = leg_doc.xpath(xpath)
            if table:
                # NOTE(review): the fourth match is used unconditionally;
                # presumably the page nests the office table several levels
                # deep -- confirm, since fewer than four matches raises
                # IndexError here.
                for detail_type, value in self._table_to_office(table[3]):
                    p.add_contact_detail(type=detail_type, value=value,
                                         note=location)

    return legs
def legislators(self, latest_only):
    """Build ``Person`` objects for the rows yielded by ``self._memberships``.

    Each row is reduced to a name, district and party. Rows with an empty
    party cell or a name ending in ``*`` are skipped. A name seen more than
    once reuses its ``Person`` and gains another term tuple. For every row
    kept, the legislator's detail page is fetched for photo and contact data.
    Fax and voice values that fail ``validate_phone_number`` are dropped.

    :param latest_only: passed through unchanged to ``self._memberships``.
    :returns: dict mapping legislator name to a ``(Person, terms)`` pair,
        where ``terms`` is a list of ``(chamber, district, term, party)``
        tuples.
    """
    # loop-invariant lookups, built once instead of once per row
    party_names = {'D': 'Democratic', 'R': 'Republican', 'I': 'Independent'}
    office_xpaths = {
        'capitol': '//table[contains(string(), "Springfield Office")]',
        'district': '//table[contains(string(), "District Office")]',
    }
    hotgarbage = ('Senate Biography Information for the 98th General '
                  'Assembly is not currently available.')

    legs = {}
    for member, chamber, term, url in self._memberships(latest_only):
        name, _, _, district, party = member.xpath('td')
        district = district.text
        detail_url = name.xpath('a/@href')[0]

        if party.text_content().strip() == "":
            self.warning("Garbage party: Skipping!")
            continue

        party = party_names[party.text]
        name = name.text_content().strip()

        # inactive legislator, skip them for now
        if name.endswith('*'):
            continue

        name = AKA.get(name, name)

        if name in legs:
            p, terms = legs[name]
            terms.append((chamber, district, term, party))
        else:
            p = Person(name, party=party)
            legs[name] = p, [(chamber, district, term, party)]

        p.add_source(url)
        p.add_source(detail_url)
        p.add_link(detail_url)

        birth_date = BIRTH_DATES.get(name, None)
        if birth_date:
            p.birth_date = birth_date

        leg_html = self.get(detail_url).text
        leg_doc = lxml.html.fromstring(leg_html)
        leg_doc.make_links_absolute(detail_url)

        if hotgarbage in leg_html:
            # The legislator's bio isn't available yet; the person stays in
            # ``legs`` but gets no photo or contact details from this page.
            self.logger.warning('No legislator bio available for ' + name)
            continue

        photo_url = leg_doc.xpath(
            '//img[contains(@src, "/members/")]/@src')[0]
        p.image = photo_url

        # start from a clean slate so repeated rows don't pile up details
        p.contact_details = []

        # email
        email = leg_doc.xpath('//b[text()="Email: "]')
        if email:
            p.add_contact_detail(type='email',
                                 value=email[0].tail.strip(),
                                 note='capitol')

        for location, xpath in office_xpaths.items():
            table = leg_doc.xpath(xpath)
            if table:
                # NOTE(review): the fourth match is used unconditionally;
                # presumably the page nests the office table several levels
                # deep -- confirm, since fewer than four matches raises
                # IndexError here.
                for detail_type, value in self._table_to_office(table[3]):
                    # drop phone-like details that do not validate
                    if (detail_type in ('fax', 'voice')
                            and not validate_phone_number(value)):
                        continue
                    p.add_contact_detail(type=detail_type, value=value,
                                         note=location)

    return legs