def test_removal_of_bios(self):
        # Scenario: one of the biographies of a person is removed from its source.
        # 1. the biography should not be there anymore
        # 2. the bioport_id should still exist
        # 3. if we have no more biographies for this person, the person should disappear
        repo = self.repo
        url1 = os.path.abspath(os.path.join(THIS_DIR, 'data/knaw/list.xml'))
        url2 = os.path.abspath(os.path.join(THIS_DIR, 'data/knaw_changed/list.xml'))

        src = Source(id=u'test', url=url1, description='knaw test dinges')
        repo.add_source(src)

        repo.download_biographies(src)

        # we should have 5 persons from this source
        self.assertEqual(len(repo.get_persons(source_id=src.id)), 5)
        # one of them has id test/002
        person2 = repo.get_biography(local_id='test/002').get_person()

        # remember the bioport_id of test/002 so we can compare it after the re-download
        original_bioport_id = person2.bioport_id

        # now we change the biographies that are available in the source
        src.url = url2
        repo.download_biographies(src)

        # now the biography of person2 does not exist anymore
        self.assertEqual(len(list(repo.get_biographies(local_id='test/002'))), 0)
        # and neither does person2 himself
        # (assertTrue rather than a bare assert, so the check is not stripped under "python -O")
        self.assertTrue(
            person2.bioport_id not in [p.bioport_id for p in repo.get_persons()],
            [(p.bioport_id, p.get_biographies()) for p in repo.get_persons()],
        )

        # we still, however, should have the bioport_id in our repository
        self.assertEqual(repo.db.get_bioport_id(biography_id='test/002'), person2.bioport_id)

        # now we re-download the old data
        src.url = url1
        repo.download_biographies(src)
        # and we have the 5 old persons again
        self.assertEqual(len(repo.get_persons(source_id=src.id)), 5)
        # and 002 is back
        self.assertEqual(len(list(repo.get_biographies(local_id='test/002'))), 1)

        # it is very important that our newly re-downloaded test/002 has the same bioport id as before
        self.assertEqual(repo.get_biography(local_id='test/002').get_person().bioport_id, original_bioport_id)

        # now, we do the same exercise of removing and re-adding, but this time we identify test/002 with test/005
        # NOTE(review): person2 is the object fetched before the downloads above, not a
        # freshly loaded one -- confirm that identify() is meant to accept it.
        person5 = repo.get_biography(local_id='test/005').get_person()
        new_person = repo.identify(person5, person2)

        # we have now one person less, and the merged person carries both biographies
        self.assertEqual(len(repo.get_persons(source_id=src.id)), 4)
        self.assertEqual(len(new_person.get_biographies()), 2)

        # switch to the changed list again (the one in which test/002 is no longer available)
        src.url = url2
        repo.download_biographies(src)
        self.assertEqual(len(list(repo.get_biographies(source_id=src.id))), 5)
        self.assertEqual(len(repo.get_persons(source_id=src.id)), 5, [p.get_biographies() for p in repo.get_persons(source_id=src.id)])
        # the identified person is still there, but is left with a single biography
        new_person = repo.get_person(bioport_id=new_person.bioport_id)
        self.assertEqual(len(new_person.get_biographies()), 1)
    def test_download_changed_bios(self):
        # Scenario: the same source is downloaded several times, first unchanged and
        # then from a changed list; bioport_ids and manually set statuses must survive.
        repo = self.repo
        url = os.path.abspath(os.path.join(THIS_DIR, 'data/knaw/list.xml'))

        # add some sources
        SOURCE_ID = u'test1'
        # NOTE(review): BASE is presumably the number of persons that the test fixture
        # already put in the repository before this test runs -- confirm against setUp.
        BASE = 10
        src = Source(id=SOURCE_ID, url=url, description='knaw test dinges')
        repo.add_source(src)
        repo.download_biographies(src)
        self.assertEqual(len(repo.get_persons(source_id=SOURCE_ID)), 5)
        self.assertEqual(len(repo.get_persons()), BASE + 5)

        old_persons = [repo.get_person(bioport_id) for bioport_id in repo.get_bioport_ids()]
        # no person may report its status as the string '1' (see the TODO below)
        self.assertFalse([x for x in old_persons if x.status == '1'])

        some_person = old_persons[0]
        for person in old_persons:
            # TODO: check why sometimes some_person.status is a string
            self.assertEqual(person.status, STATUS_NEW)
            # mark every person, so we can check later that downloads do not reset the status
            person.record.status = STATUS_DIFFICULT
            person.save()

        # if we download the illustrations of our source, the set of persons should not change
        repo.download_illustrations(src)
        self.assertEqual(len(repo.get_persons(source_id=SOURCE_ID)), 5)
        self.assertEqual(len(repo.get_persons()), BASE + 5)
        self.assertEqual(set([p.bioport_id for p in old_persons]), set([p.bioport_id for p in repo.get_persons()]))

        # now we download the biographies a second time, and again nothing should have changed
        repo.download_biographies(src)
        self.assertEqual(len(repo.get_persons(source_id=SOURCE_ID)), 5)
        self.assertEqual(len(repo.get_persons()), BASE + 5)
        self.assertEqual(set([p.bioport_id for p in old_persons]), set([p.bioport_id for p in repo.get_persons()]))

        # also the status should remain the same
        self.assertEqual(STATUS_DIFFICULT, repo.get_person(bioport_id=some_person.bioport_id).status)

        # now we change the biographies that are available in some of the sources, and download again
        url2 = os.path.abspath(os.path.join(THIS_DIR, 'data/knaw_changed/list.xml'))
        src.url = url2
        repo.download_biographies(src)
        # the new url has one biography less, and one new one.
        # we remove the one that has disappeared, so the number of persons should now be the same as it was previously
        # 1 has remained exactly the same
        # 2 has disappeared
        # 3 has changed location - it is found in 006.xml
        # 4 has changed name
        # 5 has changed location, but should have the same data
        # 7 is a new entry
        self.assertEqual(len(repo.get_persons(source_id=SOURCE_ID)), 5)
        # all of the old bioport_ids should still be available
        # (fetch the ids once instead of once per person)
        available_bioport_ids = set(repo.get_bioport_ids())
        for p in old_persons:
            self.assertTrue(p.bioport_id in available_bioport_ids, p.bioport_id)

        # create a dictionary with the persons from our test-source, accessible by the
        # number in their biography id ('001' -> 1, ..., '009' -> 9)
        number_by_bio_id = dict(('00%s' % i, i) for i in range(1, 10))
        persons = {}
        for person in repo.get_persons(source_id=SOURCE_ID):
            # if a person has several biographies from this source, the last one wins
            knaw_bio = None
            for bio in person.get_biographies():
                if bio.get_source().id == SOURCE_ID:
                    knaw_bio = bio
            # fail with a readable message instead of an AttributeError on None below
            self.assertTrue(
                knaw_bio is not None,
                'person %s has no biography from source %s' % (person.bioport_id, SOURCE_ID),
            )
            person.knaw_bio = knaw_bio
            bio_id = knaw_bio.get_id().split('/')[1]
            # ids outside 001..009 are ignored
            if bio_id in number_by_bio_id:
                persons[number_by_bio_id[bio_id]] = person
        # person 2 has disappeared
        self.assertTrue(2 not in persons, persons)
        # 3 has changed location - it is found in 006.xml.
        url_of_3 = persons[3].knaw_bio.source_url
        self.assertTrue(url_of_3.endswith('006.xml'), url_of_3)
        # even in a different location, its status remains the same
        self.assertEqual(persons[3].status, STATUS_DIFFICULT)

        # person 4 has a changed name
        name_of_4 = persons[4].name()
        self.assertTrue('changed' in name_of_4, name_of_4)
        # even if his name has changed, his status remains the same
        self.assertEqual(persons[4].status, STATUS_DIFFICULT)
        # person 5 has changed location
        url_of_5 = persons[5].knaw_bio.source_url
        self.assertTrue(url_of_5.endswith('005a.xml'), url_of_5)
        self.assertEqual(persons[5].status, STATUS_DIFFICULT)

        # person 7 is a new entry
        self.assertEqual(persons[7].status, STATUS_NEW)