def test_simple_merge(self):
    """Merge two same-named people and check the single surviving record."""
    # The duplicate is created first on purpose: its created_at value is
    # expected to be carried over to the surviving record.
    duplicate = Person.objects.create(
        name='Barack Obama',
        sort_name='Obama',
        gender='Male',
    )
    survivor = Person.objects.create(
        name='Barack Obama',
        sort_name='Barack',
    )
    duplicate_id = duplicate.id
    duplicate_created_at = duplicate.created_at

    merge(survivor, duplicate)

    self.assertEqual(
        Person.objects.count(),
        1,
        msg="There should only be one person after merge.",
    )
    merged = Person.objects.get()

    # the id of the merged-away record is kept as an identifier
    merged_identifier = merged.identifiers.get()
    self.assertEqual(merged_identifier.identifier, duplicate_id)

    # identical names must not produce an other_names entry
    self.assertEqual(merged.other_names.count(), 0)

    self.assertEqual(
        merged.sort_name,
        'Barack',
        msg="Simple fields should take obj1's properties",
    )
    self.assertEqual(merged.created_at, duplicate_created_at)
    self.assertGreater(merged.updated_at, merged.created_at)
    self.assertEqual(
        merged.gender,
        'Male',
        msg="Data should trump empty, no matter the update order.",
    )

    locked = set(merged.locked_fields)
    assert locked == {'identifiers', 'gender', 'sort_name'}
def merge_persons(persons):
    """
    Merge items in persons iterable into one Person object, which is returned.

    The first item is the record that survives; every following item is
    merged into it. Any iterable is accepted and the caller's container is
    not modified.

    Raises IndexError if persons is empty.
    """
    # Work on a private copy: the previous persons.pop(0) removed the first
    # element from the caller's list and failed on non-list iterables.
    candidates = list(persons)

    # each person will be merged into this one (IndexError when empty)
    keep = candidates[0]
    if keep.__class__ != OCDPersonProxy:
        keep.__class__ = OCDPersonProxy

    # loop over all the rest
    for other in candidates[1:]:
        if other.__class__ != OCDPersonProxy:
            other.__class__ = OCDPersonProxy
        merge(keep, other)
        # reload so the next merge starts from the stored state
        keep.refresh_from_db()

    dedupe_person_ids(keep)
    dedupe_person_candidacies(keep)

    keep.refresh_from_db()

    # make sure Person name is same as most recent candidate_name
    latest_candidate_name = keep.candidacies.latest(
        'contest__election__date',
    ).candidate_name
    if keep.name != latest_candidate_name:
        # move current Person.name into other_names
        keep.add_other_name(keep.name, 'Updated current name in merge')
        keep.name = latest_candidate_name
    keep.save()

    return keep
def test_other_name_merge(self):
    """Merge two differently named people and check other_names and locks."""
    survivor = Person.objects.create(name='Barack Obama')
    duplicate = Person.objects.create(name='Barry Obama')

    merge(survivor, duplicate)

    # the differing name ends up as the survivor's only other_names entry
    moved_name = survivor.other_names.get()
    assert moved_name.name == 'Barry Obama'

    # check locked_fields on the one remaining record
    remaining = Person.objects.get()
    locked = set(remaining.locked_fields)
    assert locked == {'identifiers', 'name', 'other_names'}
def test_merge_contact_details(self):
    """Merge two people with matching contact details; all four are kept."""
    survivor = Person.objects.create(name='Barack Obama')
    duplicate = Person.objects.create(name='Barack Obama')

    # same fax number on both records; only the duplicate carries a note
    survivor.contact_details.create(type='fax', value='555-123-4567')
    duplicate.contact_details.create(
        type='fax',
        value='555-123-4567',
        note="Throw out your fax!",
    )
    # same email on both records
    survivor.contact_details.create(type='email', value='*****@*****.**')
    duplicate.contact_details.create(type='email', value='*****@*****.**')

    merge(survivor, duplicate)

    merged = Person.objects.get()
    # no deduping for now
    self.assertEqual(merged.contact_details.count(), 4)

    locked = set(merged.locked_fields)
    assert locked == {'identifiers', 'contact_details'}
def test_merge_related_obj(self):
    """Merge re-points a bill sponsorship from the duplicate to the survivor."""
    survivor = Person.objects.create(name='Barack Obama')
    duplicate = Person.objects.create(name='Barack Obama')

    # minimal chain of objects needed to hang a sponsorship on the duplicate
    division = Division.objects.create(
        id='ocd-division/country:us',
        name='US',
    )
    jurisdiction = Jurisdiction.objects.create(name='US', division=division)
    session = jurisdiction.legislative_sessions.create(name='2015')
    bill = Bill.objects.create(identifier='HB 1', legislative_session=session)
    BillSponsorship.objects.create(bill=bill, person=duplicate)

    merge(survivor, duplicate)

    # the single sponsorship row now points at the survivor
    sponsorship = BillSponsorship.objects.get()
    assert sponsorship.person == survivor

    remaining = Person.objects.get()
    locked = set(remaining.locked_fields)
    assert locked == {'identifiers'}
def merge_persons(self, filer_id):
    """
    Merge the Person objects that share the same CAL-ACCESS filer_id.

    The first matching Person survives. Every other match is merged into it
    when its name and sort_name equal the survivor's; a mismatching Person is
    logged and left unmerged.

    Return the merged Person object.

    Raises IndexError if no Person has the given filer_id.
    """
    # Evaluate the query once so the survivor and the loop below see the
    # same rows in the same order.
    persons = list(
        Person.objects.filter(
            identifiers__scheme='calaccess_filer_id',
            identifiers__identifier=filer_id,
        )
    )

    if self.verbosity > 2:
        self.log(
            "Merging {0} Persons sharing filer_id {1}".format(
                len(persons),
                filer_id,
            )
        )

    # each person will be merged into this one
    survivor = persons[0]
    # The identifier join can return one Person several times; track ids so
    # nobody is merged into itself or merged twice.
    handled_ids = {survivor.id}

    # loop over all the rest of them
    for duplicate in persons[1:]:
        if duplicate.id in handled_ids:
            continue
        handled_ids.add(duplicate.id)

        names_differ = (
            survivor.name != duplicate.name or
            survivor.sort_name != duplicate.sort_name
        )
        if names_differ:
            # This used to stop in an interactive ipdb breakpoint, which
            # hangs unattended runs. Report the conflict and skip instead.
            self.log(
                "Not merging Person {0} into {1} for filer_id {2}: "
                "name or sort_name differs".format(
                    duplicate.id,
                    survivor.id,
                    filer_id,
                )
            )
            continue

        merge(survivor, duplicate)

    # also delete the now duplicated PersonIdentifier objects; only rows for
    # this filer_id are touched so other filer ids on the Person are kept
    surplus_identifiers = survivor.identifiers.filter(
        scheme='calaccess_filer_id',
        identifier=filer_id,
    )[1:]
    for identifier in surplus_identifiers:
        identifier.delete()

    return survivor
def test_merge_memberships(self):
    """Merge people sharing a party membership; expect three memberships."""
    survivor = Person.objects.create(name='Barack Obama')
    duplicate = Person.objects.create(name='Barack Obama')

    division = Division.objects.create(
        id='ocd-division/country:us',
        name='US',
    )
    jurisdiction = Jurisdiction.objects.create(name='US', division=division)
    party = Organization.objects.create(
        name='Democratic',
        classification='party',
    )
    government = Organization.objects.create(
        name='Federal Government',
        jurisdiction=jurisdiction,
        classification='government',
    )
    # this isn't how you'd really use posts
    president = Post.objects.create(label='President', organization=government)
    senator = Post.objects.create(label='Senator', organization=government)

    Membership.objects.create(organization=party, person=survivor)
    Membership.objects.create(
        organization=government,
        person=survivor,
        post=president,
    )
    Membership.objects.create(organization=party, person=duplicate)
    # NOTE(review): the Senator membership belongs to the survivor, not the
    # duplicate, so moving a non-duplicate membership across is not exercised
    # here -- confirm this is intended.
    Membership.objects.create(
        organization=government,
        person=survivor,
        post=senator,
    )

    merge(survivor, duplicate)

    merged = Person.objects.get()
    # ensure that the party memberships are deduped and others are kept
    assert merged.memberships.count() == 3
    assert president.memberships.count() == 1
    assert senator.memberships.count() == 1

    locked = set(merged.locked_fields)
    assert locked == {'memberships', 'identifiers'}
def test_no_self_merge(self):
    """Merging a person with itself must raise ValueError."""
    only_person = Person.objects.create(name='Barack Obama')
    # callable form of assertRaises: invokes merge(only_person, only_person)
    self.assertRaises(ValueError, merge, only_person, only_person)
def merge(self, persons):
    """
    Merge items in persons iterable into one Person object.

    The first item survives and every following item is merged into it.
    Afterwards duplicated calaccess_filer_id identifiers are collapsed,
    candidacies that share a contest are folded into one, and the Person
    name is aligned with the candidate_name of the latest candidacy.

    Any iterable is accepted and the caller's container is not modified.

    Return the merged Person object.

    Raises IndexError if persons is empty.
    """
    # Work on a private copy: the previous persons.pop(0) removed the first
    # element from the caller's list and failed on non-list iterables.
    people = list(persons)

    # each person will be merged into this one (IndexError when empty)
    keep = people[0]

    # loop over all the rest
    for other in people[1:]:
        # NOTE: inside a class body this name resolves to the module-level
        # merge function, not to this method.
        merge(keep, other)
        keep.refresh_from_db()

    # also delete the now duplicated PersonIdentifier objects
    keep_filer_ids = keep.identifiers.filter(scheme='calaccess_filer_id')
    dupe_filer_ids = keep_filer_ids.values("identifier").annotate(
        row_count=Count('id'),
    ).order_by().filter(row_count__gt=1)

    for dupe in dupe_filer_ids.all():
        # delete all rows with that filer_id
        keep_filer_ids.filter(identifier=dupe['identifier']).delete()
        # then re-add the one
        keep.identifiers.create(
            scheme='calaccess_filer_id',
            identifier=dupe['identifier'],
        )

    # and dedupe candidacy records
    # first, make groups by contests with more than one candidacy
    contest_group_q = keep.candidacies.values("contest").annotate(
        row_count=Count('id')).filter(row_count__gt=1)

    # loop over each contest group
    for group in contest_group_q.all():
        cands = keep.candidacies.filter(contest=group['contest'])

        # preference to "qualified" candidacy (from scrape)
        if cands.filter(registration_status='qualified').exists():
            cand_to_keep = cands.filter(
                registration_status='qualified').all()[0]
        # or the one with the most recent filed_date
        else:
            cand_to_keep = cands.latest('filed_date')

        # loop over all the other candidacies in the group
        for cand_to_discard in cands.exclude(id=cand_to_keep.id).all():
            # assuming the only thing in extras is form501_filing_ids
            if 'form501_filing_ids' in cand_to_discard.extras:
                for filing_id in cand_to_discard.extras['form501_filing_ids']:
                    self.link_form501_to_candidacy(filing_id, cand_to_keep)
            cand_to_keep.refresh_from_db()

            if 'form501_filing_ids' in cand_to_keep.extras:
                self.update_candidacy_from_form501s(cand_to_keep)
            cand_to_keep.refresh_from_db()

            # keep the candidate_name, if not already somewhere else
            if (
                cand_to_discard.candidate_name != cand_to_keep.candidate_name and
                cand_to_discard.candidate_name != cand_to_keep.person.name and
                not cand_to_keep.person.other_names.filter(
                    name=cand_to_discard.candidate_name).exists()
            ):
                keep.other_names.create(
                    name=cand_to_discard.candidate_name,
                    note='From merge of %s candidacies' % cand_to_keep.contest)
                cand_to_keep.refresh_from_db()

            # keep the candidacy sources
            if cand_to_discard.sources.exists():
                for source in cand_to_discard.sources.all():
                    if not cand_to_keep.sources.filter(
                            url=source.url).exists():
                        cand_to_keep.sources.create(
                            url=source.url,
                            note=source.note,
                        )
                cand_to_keep.refresh_from_db()

            # The field updates below are made in memory only; nothing may
            # reload cand_to_keep between here and the save() that follows.
            # keep earliest filed_date
            if cand_to_keep.filed_date and cand_to_discard.filed_date:
                if cand_to_keep.filed_date > cand_to_discard.filed_date:
                    cand_to_keep.filed_date = cand_to_discard.filed_date
            elif cand_to_discard.filed_date:
                cand_to_keep.filed_date = cand_to_discard.filed_date

            # keep is_incumbent if True
            if not cand_to_keep.is_incumbent and cand_to_discard.is_incumbent:
                cand_to_keep.is_incumbent = cand_to_discard.is_incumbent

            # assuming not trying to merge candidacies with different parties
            if not cand_to_keep.party and cand_to_discard.party:
                cand_to_keep.party = cand_to_discard.party

            cand_to_keep.save()
            cand_to_discard.delete()

    keep.refresh_from_db()

    # make sure Person name is same as most recent candidate_name
    latest_candidate_name = keep.candidacies.latest(
        'contest__election__date',
    ).candidate_name
    if keep.name != latest_candidate_name:
        # move current Person.name into other_names
        if not keep.other_names.filter(name=keep.name).exists():
            keep.other_names.create(name=keep.name)
        keep.name = latest_candidate_name
    keep.save()

    return keep