Пример #1
0
class TestHerdCorral(unittest.TestCase):

    def setUp(self):
        data_path = os.path.join(os.path.dirname(__file__), 'profiles_100.json')
        with open(data_path, 'r') as data_file:
            population = tuple(json.load(data_file))
        records = [gen_record(profile) for profile in population]
        self.herd = Herd()
        self.herd.populate(records)

    def test_herd_corraling(self):
        self.herd.corral()
        for record in self.herd._population:
            self.assertIsInstance(record._blocks, tuple)
            self.assertTrue(1 <= len(record._blocks) <= 8)
            self.assertIsInstance(
                record._meta.forename_freq_ref, unicode
            )
            self.assertIsInstance(
                record._meta.mid_forename_freq_ref, unicode
            )
            self.assertIsInstance(
                record._meta.birth_surname_freq_ref, unicode
            )
            self.assertIsInstance(
                record._meta.current_surname_freq_ref, unicode
            )
Пример #2
0
class TestHerdProperties(unittest.TestCase):

    def setUp(self):
        data_path = os.path.join(os.path.dirname(__file__), 'profiles_100.json')
        with open(data_path, 'r') as data_file:
            population = tuple(json.load(data_file))
        records = [gen_record(profile) for profile in population]
        self.herd = Herd()
        self.herd.populate(records)

    def test_herd_str_method(self):
        try:
            herd = Herd()
            str(herd)
        except Exception as e:
            self.fail("Getting string of empty herd raised: {0}.".format(e))
        self.assertEqual(str(herd), '()')
        try:
            herd = self.herd
            str(herd)
        except Exception as e:
            self.fail("Getting string of herd raised: {0}.".format(e))
        self.assertNotEqual(str(herd), '()')

    def test_retrieving_populated_herd_size(self):
        self.assertEqual(self.herd.size, 100)

    def test_retrieving_herd_size_with_no_population(self):
        try:
            herd = Herd()
        except Exception as e:
            self.fail("Creating herd with no population raised: {0}.".format(e))
        self.assertEqual(herd.size, 0)
Пример #3
0
 def setUp(self):
     data_path = os.path.join(os.path.dirname(__file__), 'profiles_100.json')
     with open(data_path, 'r') as data_file:
         population = tuple(json.load(data_file))
     records = [gen_record(profile) for profile in population]
     self.herd = Herd()
     self.herd.populate(records)
Пример #4
0
 def setUp(self):
     population = (
         {
             'forename': 'Adelyn',
             'mid_forename': 'Heidenreich',
             'current_surname': 'Bartell',
             'birth_surname': 'Gerlach'
         },
         {
             'forename': 'John',
             'mid_forename': 'Frederich',
             'current_surname': 'Sanders'
         },
         {
             'forename': 'Joseph',
             'current_surname': 'Smith'
         },
         {
             'forename': 'John',
             'mid_forename': 'Heidenreich',
             'current_surname': 'Smith',
             'birth_surname': 'Gerlach'
         }
     )
     records = [gen_record(profile) for profile in population]
     self.herd = Herd()
     self.herd.populate(records)
     self.herd.corral()
Пример #5
0
class TestHerdFrequencyDictionaries(unittest.TestCase):

    def setUp(self):
        population = (
            {
                'forename': 'Adelyn',
                'mid_forename': 'Heidenreich',
                'current_surname': 'Bartell',
                'birth_surname': 'Gerlach'
            },
            {
                'forename': 'John',
                'mid_forename': 'Frederich',
                'current_surname': 'Sanders'
            },
            {
                'forename': 'Joseph',
                'current_surname': 'Smith'
            },
            {
                'forename': 'John',
                'mid_forename': 'Heidenreich',
                'current_surname': 'Smith',
                'birth_surname': 'Gerlach'
            }
        )
        records = [gen_record(profile) for profile in population]
        self.herd = Herd()
        self.herd.populate(records)
        self.herd.corral()

    def test_forename_freq_dict(self):
        self.assertEqual(self.herd._forename_freq_dict['J'], 3)
        self.assertEqual(self.herd._forename_freq_dict['H'], 2)
        self.assertEqual(self.herd._forename_freq_dict['j'], 0)
        self.assertEqual(len(self.herd._forename_freq_dict.keys()), 4)

    def test_surname_freq_dict(self):
        self.assertEqual(self.herd._surname_freq_dict['SM0'], 2)
        self.assertEqual(self.herd._surname_freq_dict['KRLK'], 2)
        self.assertEqual(self.herd._surname_freq_dict['smo'], 0)
        self.assertEqual(len(self.herd._surname_freq_dict.keys()), 4)
Пример #6
0
 def test_no_errors(self):
     """See how the probability matrix looks with no textual errors."""
     herd = Herd()
     data_path = os.path.join(os.path.dirname(__file__),
                              'error_free_3x2.json')
     with open(data_path, 'r') as data_file:
         population = tuple(json.load(data_file))
     records = [gen_record(profile) for profile in population]
     herd = Herd()
     herd.populate(records)
     herd.corral()
     print()
     print(herd.similarity_matrix)
Пример #7
0
 def test_gender_misclassification(self):
     """See how gender misclassification impacts probability matrix.
     """
     herd = Herd()
     data_path = os.path.join(os.path.dirname(__file__),
                              'gender_misclassification_3x2.json')
     with open(data_path, 'r') as data_file:
         population = tuple(json.load(data_file))
     records = [gen_record(profile) for profile in population]
     herd = Herd()
     herd.populate(records)
     herd.corral()
     print()
     print(herd.similarity_matrix)
Пример #8
0
 def test_character_transposition(self):
     """See how character transposition in someone's name impacts probability
     matrix.
     """
     herd = Herd()
     data_path = os.path.join(os.path.dirname(__file__),
                              'character_transposition_3x2.json')
     with open(data_path, 'r') as data_file:
         population = tuple(json.load(data_file))
     records = [gen_record(profile) for profile in population]
     herd = Herd()
     herd.populate(records)
     herd.corral()
     print()
     print(herd.similarity_matrix)
Пример #9
0
 def setUp(self):
     population = (
         {
             'forename': 'Adelyn',
             'mid_forename': 'Heidenreich',
             'current_surname': 'Bartell',
             'birth_surname': 'Gerlach',
             'address1': '448 Jones Street',
             'postal_code': '95786',
             'sex': 'F',
             'birth_year': '1977',
             'birth_month': '08',
             'birth_day': '27'
         },
         {
             'forename': 'Jane',
             'current_surname': 'Doe',
             'address1': '448 Jones Street',
             'postal_code': '95786',
             'sex': 'F',
             'birth_year': '1977',
             'birth_month': '08',
             'birth_day': '27'
         },
         {
             'forename': 'Adelyn',
             'current_surname': 'Bartell',
             'address1': '612 Johson Ave',
             'postal_code': '92436',
             'sex': 'F',
             'birth_year': '1977',
             'birth_month': '08',
             'birth_day': '27'
         }
     )
     records = [gen_record(profile) for profile in population]
     self.herd = Herd()
     self.herd.populate(records)
     self.herd.corral()
Пример #10
0
 def test_populating_herd(self):
     records = [gen_record(profile) for profile in self.profiles]
     herd = Herd()
     herd.populate(records)
     self.assertEqual(herd.size, 100)
Пример #11
0
class TestHerdSimilarityMatrix(unittest.TestCase):

    def setUp(self):
        population = (
            {
                'forename': 'Adelyn',
                'mid_forename': 'Heidenreich',
                'current_surname': 'Bartell',
                'birth_surname': 'Gerlach',
                'address1': '448 Jones Street',
                'postal_code': '95786',
                'sex': 'F',
                'birth_year': '1977',
                'birth_month': '08',
                'birth_day': '27'
            },
            {
                'forename': 'Jane',
                'current_surname': 'Doe',
                'address1': '448 Jones Street',
                'postal_code': '95786',
                'sex': 'F',
                'birth_year': '1977',
                'birth_month': '08',
                'birth_day': '27'
            },
            {
                'forename': 'Adelyn',
                'current_surname': 'Bartell',
                'address1': '612 Johson Ave',
                'postal_code': '92436',
                'sex': 'F',
                'birth_year': '1977',
                'birth_month': '08',
                'birth_day': '27'
            }
        )
        records = [gen_record(profile) for profile in population]
        self.herd = Herd()
        self.herd.populate(records)
        self.herd.corral()

    def test_similarity_matrix(self):
        test_similarity = np.array([
            [
                record_similarity(self.herd, self.herd._population[0],
                                  self.herd._population[0],
                                  damerau_levenshtein,
                                  damerau_levenshtein),
                0,
                record_similarity(self.herd, self.herd._population[0],
                                  self.herd._population[2],
                                  damerau_levenshtein,
                                  damerau_levenshtein)],
            [
                0,
                record_similarity(self.herd, self.herd._population[1],
                                  self.herd._population[1],
                                  damerau_levenshtein,
                                  damerau_levenshtein),
                0],
            [
                record_similarity(self.herd, self.herd._population[2],
                                  self.herd._population[0],
                                  damerau_levenshtein,
                                  damerau_levenshtein),
                0,
                record_similarity(self.herd, self.herd._population[2],
                                  self.herd._population[2],
                                  damerau_levenshtein,
                                  damerau_levenshtein)]],
            dtype=np.float32)
        self.assertTrue((test_similarity == self.herd.similarity_matrix).all())
Пример #12
0
 def setUp(self):
     population = (
         {
             'forename': 'Adelyn',
             'mid_forename': 'Heidenreich',
             'current_surname': 'Bartell',
             'birth_surname': 'Gerlach',
             'address1': '448 Jones Street',
             'postal_code': '95786',
             'sex': 'M',
             'national_id1': 'D599776',
             'birth_year': '1977',
             'birth_month': '08',
             'birth_day': '27'
         },
         {
             'forename': 'Adelyn',
             'mid_forename': 'Frederich',
             'current_surname': 'Gerlach'
         },
         {
             'forename': 'Joseph',
             'current_surname': 'Smith',
             'address1': '448 Jones Street',
             'postal_code': '95786',
             'sex': 'M',
             'national_id1': 'D599776',
             'birth_year': '1977',
             'birth_month': '08',
             'birth_day': '27'
         },
         {
             'forename': 'John',
             'mid_forename': 'Heidenreich',
             'current_surname': 'Smith',
             'birth_surname': 'Gerlach',
             'address1': '448 Jones Avenue',
             'postal_code': '97856',
             'sex': 'F',
             'national_id1': 'D599886',
             'birth_year': '1986',
             'birth_month': '10',
             'birth_day': '27'
         },
         {
             'forename': 'Jason',
             'current_surname': 'Sanders',
             'address1': '448 Jones Street',
             'address2': 'Apt. A',
             'postal_code': '97586',
             'sex': 'F',
             'national_id1': 'D597976',
             'birth_year': '1977',
             'birth_month': '10',
             'birth_day': '27'
         }
     )
     records = [gen_record(profile) for profile in population]
     self.herd = Herd()
     self.herd.populate(records)
     self.herd.corral()
Пример #13
0
class TestMeasuresSimilarityFunctions(unittest.TestCase):

    def setUp(self):
        population = (
            {
                'forename': 'Adelyn',
                'mid_forename': 'Heidenreich',
                'current_surname': 'Bartell',
                'birth_surname': 'Gerlach',
                'address1': '448 Jones Street',
                'postal_code': '95786',
                'sex': 'M',
                'national_id1': 'D599776',
                'birth_year': '1977',
                'birth_month': '08',
                'birth_day': '27'
            },
            {
                'forename': 'Adelyn',
                'mid_forename': 'Frederich',
                'current_surname': 'Gerlach'
            },
            {
                'forename': 'Joseph',
                'current_surname': 'Smith',
                'address1': '448 Jones Street',
                'postal_code': '95786',
                'sex': 'M',
                'national_id1': 'D599776',
                'birth_year': '1977',
                'birth_month': '08',
                'birth_day': '27'
            },
            {
                'forename': 'John',
                'mid_forename': 'Heidenreich',
                'current_surname': 'Smith',
                'birth_surname': 'Gerlach',
                'address1': '448 Jones Avenue',
                'postal_code': '97856',
                'sex': 'F',
                'national_id1': 'D599886',
                'birth_year': '1986',
                'birth_month': '10',
                'birth_day': '27'
            },
            {
                'forename': 'Jason',
                'current_surname': 'Sanders',
                'address1': '448 Jones Street',
                'address2': 'Apt. A',
                'postal_code': '97586',
                'sex': 'F',
                'national_id1': 'D597976',
                'birth_year': '1977',
                'birth_month': '10',
                'birth_day': '27'
            }
        )
        records = [gen_record(profile) for profile in population]
        self.herd = Herd()
        self.herd.populate(records)
        self.herd.corral()

    def test_extract_forename_info(self):
        record = self.herd._population[0]
        forename, weight = extract_forename_similarity_info(self.herd,
                                                            record,
                                                            'fore')
        self.assertEqual(forename, 'adelyn')
        self.assertEqual(weight, 0.25)
        forename, weight = extract_forename_similarity_info(self.herd,
                                                            record,
                                                            'mid_fore')
        self.assertEqual(forename, 'heidenreich')
        self.assertEqual(weight, 0.25)
        record = self.herd._population[4]
        forename, weight = extract_forename_similarity_info(self.herd,
                                                            record,
                                                            'fore')
        self.assertEqual(forename, 'jason')
        self.assertEqual(weight, 0.375)
        forename, weight = extract_forename_similarity_info(self.herd,
                                                            record,
                                                            'mid_fore')
        self.assertEqual(forename, '')
        self.assertEqual(weight, 0.0)

    def test_extract_surname_info(self):
        record = self.herd._population[0]
        surname, weight = extract_surname_similarity_info(self.herd,
                                                          record,
                                                          'birth')
        self.assertEqual(surname, 'gerlach')
        self.assertEqual(round(weight, 5), 0.42857)
        surname, weight = extract_surname_similarity_info(self.herd,
                                                          record,
                                                          'current')
        self.assertEqual(surname, 'bartell')
        self.assertEqual(round(weight, 5), 0.14286)
        record = self.herd._population[4]
        surname, weight = extract_surname_similarity_info(self.herd,
                                                          record,
                                                          'birth')
        self.assertEqual(surname, '')
        self.assertEqual(weight, 0.0)
        surname, weight = extract_surname_similarity_info(self.herd,
                                                          record,
                                                          'current')
        self.assertEqual(surname, 'sanders')
        self.assertEqual(round(weight, 5), 0.14286)

    def test_forename_similarity(self):
        records = [self.herd._population[0], self.herd._population[4]]
        weight = get_forename_similarity(self.herd,
                                         records,
                                         damerau_levenshtein,
                                         "fore")
        self.assertEqual(weight, (-4.0, 6))
        weight = get_forename_similarity(self.herd,
                                         records,
                                         damerau_levenshtein,
                                         "mid_fore")
        self.assertEqual((round(weight[0], 5), weight[1]), (-4.90909, 6))
        records = [self.herd._population[4], self.herd._population[0]]
        weight = get_forename_similarity(self.herd,
                                         records,
                                         damerau_levenshtein,
                                         "mid_fore")
        self.assertEqual(weight, (0, 6))
        records = [self.herd._population[0], self.herd._population[3]]
        weight = get_forename_similarity(self.herd,
                                         records,
                                         damerau_levenshtein,
                                         "mid_fore")
        self.assertEqual(weight, (6.0, 6))

    def test_surname_similarity(self):
        records = [self.herd._population[0], self.herd._population[4]]
        weight = get_surname_similarity(self.herd,
                                        records,
                                        damerau_levenshtein,
                                        "current")
        self.assertEqual((round(weight[0], 5), weight[1]), (-5.14286, 12))
        weight = get_surname_similarity(self.herd,
                                        records,
                                        damerau_levenshtein,
                                        "birth")
        self.assertEqual(weight, (-12.0, 12))
        records = [self.herd._population[4], self.herd._population[0]]
        weight = get_surname_similarity(self.herd,
                                        records,
                                        damerau_levenshtein,
                                        "birth")
        self.assertEqual(weight, (0, 12))
        records = [self.herd._population[0], self.herd._population[1]]
        weight = get_surname_similarity(self.herd,
                                        records,
                                        damerau_levenshtein,
                                        "birth")
        self.assertEqual(weight, (12.0, 12))

    def test_address_similarity(self):
        records = [self.herd._population[0], self.herd._population[1]]
        weight = get_address_similarity(records, damerau_levenshtein)
        self.assertEqual(weight, 0)
        records = [self.herd._population[0], self.herd._population[3]]
        weight = get_address_similarity(records, damerau_levenshtein)
        self.assertEqual(weight, 2)
        records = [self.herd._population[0], self.herd._population[4]]
        weight = get_address_similarity(records, damerau_levenshtein)
        self.assertEqual(weight, 7)
        records = [self.herd._population[0], self.herd._population[2]]
        weight = get_address_similarity(records, damerau_levenshtein)
        self.assertEqual(weight, 7)

    def test_clean_address(self):
        address = u'448 Jones Street'.lower()
        cleaned_address = clean_address(address)
        self.assertEqual(cleaned_address, u'448 jones st')
        address = u'448 Jones Avenue, Apartment 2'.lower()
        cleaned_address = clean_address(address)
        self.assertEqual(cleaned_address, u'448 jones ave apt 2')
        address = u'448-A Stromme Strt;; Building G'.lower()
        cleaned_address = clean_address(address)
        self.assertEqual(cleaned_address, u'448 a stromme st bldg g')
        self.assertEqual(clean_address(u''), u'')
        self.assertEqual(clean_address(u' '), u'')

    def test_post_code_similarity(self):
        records = [self.herd._population[0], self.herd._population[1]]
        weight = get_post_code_similarity(records, damerau_levenshtein)
        self.assertEqual(weight, 0)
        records = [self.herd._population[0], self.herd._population[3]]
        weight = get_post_code_similarity(records, damerau_levenshtein)
        self.assertEqual(weight, 0)
        records = [self.herd._population[0], self.herd._population[4]]
        weight = get_post_code_similarity(records, damerau_levenshtein)
        self.assertEqual(weight, 1)
        records = [self.herd._population[0], self.herd._population[2]]
        weight = get_post_code_similarity(records, damerau_levenshtein)
        self.assertEqual(weight, 4)

    def test_sex_similarity(self):
        records = [self.herd._population[0], self.herd._population[1]]
        weight = get_sex_similarity(records)
        self.assertEqual(weight, -10)
        records = [self.herd._population[0], self.herd._population[3]]
        weight = get_sex_similarity(records)
        self.assertEqual(weight, -10)
        records = [self.herd._population[0], self.herd._population[2]]
        weight = get_sex_similarity(records)
        self.assertEqual(weight, 1)

    def test_dob_similarity(self):
        records = [self.herd._population[0], self.herd._population[1]]
        weight = get_dob_similarity(records, damerau_levenshtein)
        self.assertEqual(weight, 0)
        records = [self.herd._population[0], self.herd._population[2]]
        weight = get_dob_similarity(records, damerau_levenshtein)
        self.assertEqual(weight, 14)
        records = [self.herd._population[0], self.herd._population[3]]
        weight = get_dob_similarity(records, damerau_levenshtein)
        self.assertEqual(weight, -4.5)
        records = [self.herd._population[0], self.herd._population[4]]
        weight = get_dob_similarity(records, damerau_levenshtein)
        self.assertEqual(weight, 4.75)

    def test_id_similarity(self):
        records = [self.herd._population[0], self.herd._population[1]]
        weight = get_id_similarity(records, damerau_levenshtein)
        self.assertEqual(weight, 0)
        records = [self.herd._population[0], self.herd._population[3]]
        weight = get_id_similarity(records, damerau_levenshtein)
        self.assertEqual(weight, 0)
        records = [self.herd._population[0], self.herd._population[4]]
        weight = get_id_similarity(records, damerau_levenshtein)
        self.assertEqual(weight, 2)
        records = [self.herd._population[0], self.herd._population[2]]
        weight = get_id_similarity(records, damerau_levenshtein)
        self.assertEqual(weight, 7)

    def test_record_similarity(self):
        records = [self.herd._population[0], self.herd._population[1]]
        weight = record_similarity(self.herd,
                                   records[0],
                                   records[1],
                                   damerau_levenshtein,
                                   damerau_levenshtein)
        self.assertEqual(round(weight, 5), -0.00038)
        records = [self.herd._population[0], self.herd._population[2]]
        weight = record_similarity(self.herd,
                                   records[0],
                                   records[1],
                                   damerau_levenshtein,
                                   damerau_levenshtein)
        self.assertEqual(round(weight, 5), 0.08752)
        records = [self.herd._population[0], self.herd._population[3]]
        weight = record_similarity(self.herd,
                                   records[0],
                                   records[1],
                                   damerau_levenshtein,
                                   damerau_levenshtein)
        self.assertEqual(round(weight, 5), -0.10248)
        records = [self.herd._population[0], self.herd._population[4]]
        weight = record_similarity(self.herd,
                                   records[0],
                                   records[1],
                                   damerau_levenshtein,
                                   damerau_levenshtein)
        self.assertEqual(round(weight, 5), -0.30872)
        records = [self.herd._population[0], self.herd._population[0]]
        weight = record_similarity(self.herd,
                                   records[0],
                                   records[1],
                                   damerau_levenshtein,
                                   damerau_levenshtein)
        self.assertEqual(round(weight, 5), 1.0)