def test_simple_merge(self):
        """Check field, identifier and timestamp handling for a basic merge."""
        # The second merge argument is created first so the test can verify
        # that its created_at is carried over to the merged record.
        first_created = Person.objects.create(
            name='Barack Obama', sort_name='Obama', gender='Male')
        second_created = Person.objects.create(
            name='Barack Obama', sort_name='Barack')
        # capture these before merging so they can be compared afterwards
        old_id, old_created_at = first_created.id, first_created.created_at

        merge(second_created, first_created)

        self.assertEqual(Person.objects.count(), 1,
                         "There should only be one person after merge.")
        merged = Person.objects.get()

        # the merged-in person's id is recorded as an identifier
        self.assertEqual(merged.identifiers.get().identifier, old_id)
        # identical names must not produce an other_names entry
        self.assertEqual(merged.other_names.count(), 0)
        self.assertEqual(merged.sort_name, 'Barack',
                         "Simple fields should take obj1's properties")
        # created_at comes from the earlier record; updated_at is later
        self.assertEqual(merged.created_at, old_created_at)
        self.assertGreater(merged.updated_at, merged.created_at)
        self.assertEqual(merged.gender, 'Male',
                         "Data should trump empty, no matter the update order.")

        expected_locked = {'identifiers', 'gender', 'sort_name'}
        assert set(merged.locked_fields) == expected_locked
Пример #2
0
def merge_persons(persons):
    """
    Merge items in persons iterable into one Person object, which is returned.

    The first item is the survivor; every other item is merged into it via
    ``merge``. The input is copied first, so the caller's list is left
    untouched and any iterable (not only a list) is accepted.

    After merging, person identifiers and candidacies are deduplicated and
    the survivor's name is aligned with its most recent candidate_name.

    Raises IndexError if persons is empty.
    """
    # work on a copy: do not mutate the caller's list, and allow non-list
    # iterables
    persons = list(persons)

    # each person will be merged into this one
    keep = persons[0]
    if keep.__class__ != OCDPersonProxy:
        # re-class the instance so the proxy's helper methods are available
        keep.__class__ = OCDPersonProxy

    # loop over all the rest
    for other in persons[1:]:
        if other.__class__ != OCDPersonProxy:
            other.__class__ = OCDPersonProxy
        merge(keep, other)
        # pick up whatever the merge wrote to the database
        keep.refresh_from_db()

    dedupe_person_ids(keep)
    dedupe_person_candidacies(keep)
    keep.refresh_from_db()

    # make sure Person name is same as most recent candidate_name
    # NOTE(review): latest() presumably raises if keep has no candidacies --
    # confirm callers always pass persons with at least one candidacy.
    latest_candidate_name = keep.candidacies.latest(
        'contest__election__date',
    ).candidate_name
    if keep.name != latest_candidate_name:
        # move current Person.name into other_names
        keep.add_other_name(keep.name, 'Updated current name in merge')
        keep.name = latest_candidate_name
    keep.save()

    return keep
    def test_other_name_merge(self):
        """A differing name on the second person is kept as an other name."""
        primary = Person.objects.create(name='Barack Obama')
        secondary = Person.objects.create(name='Barry Obama')

        merge(primary, secondary)

        # the second person's name should now be the only other_names entry
        alias = primary.other_names.get()
        assert alias.name == 'Barry Obama'

        # locked_fields should list exactly these fields
        survivor = Person.objects.get()
        expected_locked = {'identifiers', 'name', 'other_names'}
        assert set(survivor.locked_fields) == expected_locked
    def test_merge_contact_details(self):
        """Contact details from both people are all kept after a merge."""
        first = Person.objects.create(name='Barack Obama')
        second = Person.objects.create(name='Barack Obama')

        # same fax number on both, but only the second carries a note
        first.contact_details.create(type='fax', value='555-123-4567')
        second.contact_details.create(
            type='fax',
            value='555-123-4567',
            note="Throw out your fax!",
        )
        # identical email on both
        first.contact_details.create(type='email', value='*****@*****.**')
        second.contact_details.create(type='email', value='*****@*****.**')

        merge(first, second)

        survivor = Person.objects.get()
        # duplicates are not collapsed for now, so all four rows remain
        self.assertEqual(survivor.contact_details.count(), 4)
        expected_locked = {'identifiers', 'contact_details'}
        assert set(survivor.locked_fields) == expected_locked
    def test_merge_related_obj(self):
        """Objects pointing at the merged-in person are re-pointed."""
        first = Person.objects.create(name='Barack Obama')
        second = Person.objects.create(name='Barack Obama')

        # minimal chain of objects needed to sponsor a bill
        division = Division.objects.create(id='ocd-division/country:us',
                                           name='US')
        jurisdiction = Jurisdiction.objects.create(name='US',
                                                   division=division)
        session = jurisdiction.legislative_sessions.create(name='2015')
        bill = Bill.objects.create(identifier='HB 1',
                                   legislative_session=session)
        # the sponsorship starts out attached to the second person
        BillSponsorship.objects.create(bill=bill, person=second)

        merge(first, second)

        # the single sponsorship should now belong to the first person
        sponsorship = BillSponsorship.objects.get()
        assert sponsorship.person == first

        survivor = Person.objects.get()
        assert set(survivor.locked_fields) == {'identifiers'}
Пример #6
0
    def merge_persons(self, filer_id):
        """
        Merge the Person objects that share the same CAL-ACCESS filer_id.

        The first matching Person is the survivor. Every other match is
        merged into it, except matches whose name or sort_name differ from
        the survivor's: those are logged and left unmerged.

        Return the merged Person object.

        Raises IndexError if no Person has the given filer_id.
        """
        # Evaluate the queryset once so the survivor and the remaining
        # persons come from the same result set.
        persons = list(
            Person.objects.filter(
                identifiers__scheme='calaccess_filer_id',
                identifiers__identifier=filer_id,
            )
        )

        if self.verbosity > 2:
            self.log(
                "Merging {0} Persons sharing filer_id {1}".format(
                    len(persons),
                    filer_id,
                )
            )

        # each person will be merged into this one
        survivor = persons[0]

        # loop over all the rest of them
        for other in persons[1:]:
            # the same Person can appear more than once in the results
            if survivor.id == other.id:
                continue
            if (
                survivor.name != other.name or
                survivor.sort_name != other.sort_name
            ):
                # Previously a debugger breakpoint; report the mismatch and
                # skip the merge so unattended runs do not hang.
                self.log(
                    "Skipping merge of Person {0} into {1}: "
                    "name or sort_name differ".format(
                        other.id,
                        survivor.id,
                    )
                )
                continue
            merge(survivor, other)

        # also delete the now duplicated PersonIdentifier objects
        # NOTE(review): this keeps only the first calaccess_filer_id row,
        # whatever its identifier value -- confirm that a Person is never
        # expected to hold several distinct filer_ids here.
        if survivor.identifiers.count() > 1:
            extra_ids = survivor.identifiers.filter(
                scheme='calaccess_filer_id',
            )[1:]
            for extra in extra_ids:
                extra.delete()

        return survivor
    def test_merge_memberships(self):
        """Duplicate memberships collapse; distinct ones survive a merge."""
        first = Person.objects.create(name='Barack Obama')
        second = Person.objects.create(name='Barack Obama')

        division = Division.objects.create(id='ocd-division/country:us',
                                           name='US')
        jurisdiction = Jurisdiction.objects.create(name='US',
                                                   division=division)
        party = Organization.objects.create(name='Democratic',
                                            classification='party')
        government = Organization.objects.create(
            name='Federal Government',
            jurisdiction=jurisdiction,
            classification='government',
        )
        # posts would not really be used this way; it is enough for the test
        president = Post.objects.create(label='President',
                                        organization=government)
        senator = Post.objects.create(label='Senator',
                                      organization=government)

        Membership.objects.create(organization=party, person=first)
        Membership.objects.create(organization=government, person=first,
                                  post=president)
        Membership.objects.create(organization=party, person=second)
        Membership.objects.create(organization=government, person=first,
                                  post=senator)

        merge(first, second)

        survivor = Person.objects.get()
        # the two party memberships become one; the post memberships remain
        assert survivor.memberships.count() == 3
        assert president.memberships.count() == 1
        assert senator.memberships.count() == 1
        expected_locked = {'memberships', 'identifiers'}
        assert set(survivor.locked_fields) == expected_locked
    def test_no_self_merge(self):
        """Merging a person into itself must raise ValueError."""
        only_person = Person.objects.create(name='Barack Obama')

        self.assertRaises(ValueError, merge, only_person, only_person)
    def merge(self, persons):
        """
        Merge items in persons iterable into one Person object.

        The first item is the survivor; every other item is merged into it.
        The input is copied first, so the caller's list is left untouched
        and any iterable (not only a list) is accepted.

        After the merge this also:

        * collapses repeated calaccess_filer_id identifiers to one row each;
        * collapses multiple candidacies in the same contest into one,
          carrying over form501 links, candidate names, sources, the
          earliest filed_date, is_incumbent and party;
        * sets the survivor's name to its most recent candidate_name.

        Return the merged Person object.

        Raises IndexError if persons is empty.
        """
        # work on a copy: do not mutate the caller's list, and allow
        # non-list iterables
        persons = list(persons)

        # each person will be merged into this one
        keep = persons[0]

        # loop over all the rest
        for other in persons[1:]:
            merge(keep, other)
            keep.refresh_from_db()

        # also delete the now duplicated PersonIdentifier objects
        keep_filer_ids = keep.identifiers.filter(scheme='calaccess_filer_id')

        # identifier values that occur on more than one row
        dupe_filer_ids = keep_filer_ids.values("identifier").annotate(
            row_count=Count('id'), ).order_by().filter(row_count__gt=1)

        for dupe in dupe_filer_ids.all():
            # delete all rows with that filer_id
            keep_filer_ids.filter(identifier=dupe['identifier']).delete()
            # then re-add the one
            keep.identifiers.create(
                scheme='calaccess_filer_id',
                identifier=dupe['identifier'],
            )

        # and dedupe candidacy records
        # first, make groups by contests with more than one candidacy
        contest_group_q = keep.candidacies.values("contest").annotate(
            row_count=Count('id')).filter(row_count__gt=1)

        # loop over each contest group
        for group in contest_group_q.all():
            cands = keep.candidacies.filter(contest=group['contest'])
            # preference to "qualified" candidacy (from scrape)
            if cands.filter(registration_status='qualified').exists():
                cand_to_keep = cands.filter(
                    registration_status='qualified').all()[0]
            # or the one with the most recent filed_date
            else:
                cand_to_keep = cands.latest('filed_date')

            # loop over all the other candidacies in the group
            for cand_to_discard in cands.exclude(id=cand_to_keep.id).all():
                # assuming the only thing in extras is form501_filing_ids
                if 'form501_filing_ids' in cand_to_discard.extras:
                    for filing_id in cand_to_discard.extras[
                            'form501_filing_ids']:
                        self.link_form501_to_candidacy(filing_id, cand_to_keep)
                # reload so changes made by the helper above are visible
                cand_to_keep.refresh_from_db()

                if 'form501_filing_ids' in cand_to_keep.extras:
                    self.update_candidacy_from_form501s(cand_to_keep)
                cand_to_keep.refresh_from_db()

                # keep the candidate_name, if not already somewhere else
                if (cand_to_discard.candidate_name !=
                        cand_to_keep.candidate_name
                        and cand_to_discard.candidate_name !=
                        cand_to_keep.person.name
                        and not cand_to_keep.person.other_names.filter(
                            name=cand_to_discard.candidate_name).exists()):
                    keep.other_names.create(
                        name=cand_to_discard.candidate_name,
                        note='From merge of %s candidacies' %
                        cand_to_keep.contest)
                    cand_to_keep.refresh_from_db()

                # keep the candidacy sources
                if cand_to_discard.sources.exists():
                    for source in cand_to_discard.sources.all():
                        # copy only sources whose url is not already present
                        if not cand_to_keep.sources.filter(
                                url=source.url).exists():
                            cand_to_keep.sources.create(
                                url=source.url,
                                note=source.note,
                            )
                        cand_to_keep.refresh_from_db()

                # The field updates below are made in memory only, so they
                # must come after the last refresh_from_db() and before
                # save().

                # keep earliest filed_date
                if cand_to_keep.filed_date and cand_to_discard.filed_date:
                    if cand_to_keep.filed_date > cand_to_discard.filed_date:
                        cand_to_keep.filed_date = cand_to_discard.filed_date
                elif cand_to_discard.filed_date:
                    cand_to_keep.filed_date = cand_to_discard.filed_date
                # keep is_incumbent if True
                if not cand_to_keep.is_incumbent and cand_to_discard.is_incumbent:
                    cand_to_keep.is_incumbent = cand_to_discard.is_incumbent
                # assuming not trying to merge candidacies with different parties
                if not cand_to_keep.party and cand_to_discard.party:
                    cand_to_keep.party = cand_to_discard.party

                cand_to_keep.save()
                cand_to_discard.delete()

        keep.refresh_from_db()

        # make sure Person name is same as most recent candidate_name
        # NOTE(review): latest() presumably raises if keep has no
        # candidacies -- confirm callers guarantee at least one.
        latest_candidate_name = keep.candidacies.latest(
            'contest__election__date', ).candidate_name
        if keep.name != latest_candidate_name:
            # move current Person.name into other_names
            if not keep.other_names.filter(name=keep.name).exists():
                keep.other_names.create(name=keep.name)
            keep.name = latest_candidate_name
        keep.save()

        return keep