예제 #1
0
    def testCorrectionLists(self):
        """Test correction lists.

        For each entry of ``self.correction_list_files`` a
        ``lookup.CorrectionList`` is created with the entry as its
        description.  Before loading, the object must report that description,
        have an empty file name and be an instance of ``list``.  It is then
        loaded from the same entry and must be non-empty, contain only
        (string, string) pairs, and have its keys in non-increasing length
        order (long elements first).

        Raises:
            AssertionError: if any of the checks above fails.
        """
        for f in self.correction_list_files:
            corr_list = lookup.CorrectionList(descr=f)

            assert corr_list.description == f, (
                'Correction list "%s" does not have correct description: "%s"'
                % (f, corr_list.description))

            # The message has to be built from corr_list itself: a failure
            # message that references an unknown name would raise NameError
            # and hide the real assertion failure.
            assert corr_list.file_name == '', (
                'Correction list "%s" has non-empty file name: %s'
                % (f, corr_list.file_name))

            assert isinstance(corr_list, list), (
                'Correction list "%s" is not a list' % (f,))

            corr_list.load(f)  # Load the table

            assert len(corr_list) > 0, (
                'Correction list "%s" is empty after load' % (f,))

            # Length of the previously seen key; None until the first key has
            # been seen, so that a first key of any length is accepted.
            prev_len = None
            old_key = ''

            for (key, value) in corr_list:
                assert isinstance(key, str), (
                    'Key in correction list "%s" is not a string: %s'
                    % (f, key))
                assert isinstance(value, str), (
                    'Value in correction list "%s" is not a string: %s'
                    % (f, value))

                # Long elements first: no key may be longer than the one
                # before it.  Report the offending key, not its value.
                assert prev_len is None or len(key) <= prev_len, (
                    'Correction list "%s" element key "%s" is longer than '
                    'previous key: "%s"' % (f, key, old_key))

                old_key = key
                prev_len = len(key)
예제 #2
0
  def setUp(self):
    """Create the data sets, test vectors and lookup objects for the tests.

    Sets on the instance:
      in_ds, out_ds        CSV data set objects (read mode / write mode).
      dates                [input string, [day, month, year]] pairs.
      date_parse_formats   Date format strings.
      phonenums            (input string, five-element expected list) pairs.
      names_gnames         (input string, six-element expected list) pairs,
                           names with given names first.
      names_snames         Same layout, names with surnames first.
      name_male_titles, name_female_titles
                           Title lists.
      name_tag_table       Tag lookup table loaded from the lookup directory.
      name_corr_list       Correction list loaded from the lookup directory.
    """

    def blank():
      # A fresh list of six empty strings on every call, so that no two rows
      # of the name tables share the same list object.
      # NOTE(review): presumably a placeholder for expected values that have
      # not been filled in yet - confirm.
      return [''] * 6

    self.in_ds = dataset.DataSetCSV(
        descri='A standardisation test data set',
        access_mode='read',
        rec_ident='rec_id',
        field_list=[],
        header_line=True,
        write_header=True,
        file_name='test-standard-dataset.csv')

    # Output fields are numbered consecutively from 0 in the order given here.
    out_field_names = ['day', 'month', 'year',
                       'country_code', 'country_name', 'area_code',
                       'number', 'extension',
                       'title', 'gender_guess',
                       'given_name', 'alt_given_name',
                       'surname', 'alt_surname',
                       'out_pass1', 'out_pass2']
    out_fields = [(name, col) for (col, name) in enumerate(out_field_names)]

    self.out_ds = dataset.DataSetCSV(
        descrip='The standardised test data set',
        access_mode='write',
        rec_ident='rec_id',
        field_list=out_fields,
        header_line=True,
        write_header=True,
        file_name='test-standardised-dataset.csv')

    # Date test vectors: [input string, [day, month, year]]
    #
    self.dates = [
        ['Sep 1, 68', ['1', '9', '1968']],
        ['18 Jan 2002', ['18', '1', '2002']],
        ['17:2:2002', ['17', '2', '2002']],
        ['2002-02-25', ['25', '2', '2002']],
        ['18,03,2001', ['18', '3', '2001']],
        ['21.12.1999', ['21', '12', '1999']],
        ['February 18,19', ['18', '2', '1919']],
        ['23\\July\\1968', ['23', '7', '1968']],
        ['18-02-2002', ['18', '2', '2002']],
        ['5/03/01', ['5', '3', '2001']],
        ['19680429', ['29', '4', '1968']],
        ['600810', ['10', '8', '1960']],
        ['3:05:2000', ['3', '5', '2000']],
        ['30.11.1989', ['30', '11', '1989']],
        ["1. January '70", ['1', '1', '1970']],
        ['01011970', ['1', '1', '1970']],
        ['10011970', ['10', '1', '1970']],
        ['31 dec  1969', ['31', '12', '1969']],
        ['30 december  69', ['30', '12', '1969']],
        ['01011970', ['1', '1', '1970']],
        ['13 Feb 1945', ['13', '2', '1945']],
        ["Feb 13, '45", ['13', '2', '1945']],
        ['April 29 1968', ['29', '4', '1968']],
        ['29-4=68', ['29', '4', '1968']],
        ['11-01-1972', ['11', '1', '1972']],
        ['January 10. 1972', ['10', '1', '1972']],
        ['29 Feb 1932', ['29', '2', '1932']],
        ['29 Feb 32', ['29', '2', '1932']],
        ['11 Jun 1902', ['11', '6', '1902']],
        ['11 Jul 1989', ['11', '7', '1989']],
        ['12111968', ['12', '11', '1968']],
        ['      21111969  ', ['21', '11', '1969']],
    ]

    self.date_parse_formats = [
        '%d %m %Y',  # 24 04 2002  or  24 4 2002
        '%d %B %Y',  # 24 April 2002
        '%d %b %Y',  # 24 Apr 2002
        '%m %d %Y',  # 04 24 2002  or  4 24 2002
        '%B %d %Y',  # April 24 2002
        '%b %d %Y',  # Apr 24 2002
        '%Y %m %d',  # 2002 04 24  or  2002 4 24
        '%Y %B %d',  # 2002 April 24
        '%Y %b %d',  # 2002 Apr 24
        '%d %m %y',  # 24 04 02    or  24 4 02
        '%d %B %y',  # 24 April 02
        '%d %b %y',  # 24 Apr 02
        '%y %m %d',  # 02 04 24    or  02 4 24
        '%y %B %d',  # 02 April 24
        '%y %b %d',  # 02 Apr 24
        '%m %d %y',  # 04 24 02    or  4 24 02
        '%B %d %y',  # April 24 02
        '%b %d %y',  # Apr 24 02
    ]

    # Phone number test vectors: (input string, five-element expected list)
    # NOTE(review): the expected list looks like [country code, country name,
    # area code, number, extension] - confirm against the standardiser.
    #
    self.phonenums = [
        ('++61 2 6125 5690', ['61', 'Australia', '02', '6125-5690', '']),
        ('0061 02 6125 5690', ['61', 'Australia', '02', '6125-5690', '']),
        ('0061   02    6125-5690',
         ['61', 'Australia', '02', '6125-5690', '']),
        ('41 312 17 84', ['41', 'Switzerland', '', '312 17 84', '']),
        ('6125 0010', ['61', 'Australia', '', '6125-0010', '']),
        ('1-800-764-0432', ['1', 'USA/Canada', '800', '764-0432', '']),
        ('02 6125 0010', ['61', 'Australia', '02', '6125-0010', '']),
        ('00 1 317-923 4523', ['1', 'USA/Canada', '317', '923-4523', '']),
        ('1 317-923 4523', ['1', 'USA/Canada', '317', '923-4523', '']),
        ('00111 41 312 17 84', ['41', 'Switzerland', '', '312 17 84', '']),
        ('00001 41 312 17 84', ['41', 'Switzerland', '', '312 17 84', '']),
        ('01 41 312 17 84', ['41', 'Switzerland', '', '312 17 84', '']),
        ('1-541-754-3010', ['1', 'USA/Canada', '541', '754-3010', '']),
        ('754-3010', ['1', 'USA/Canada', '', '754-3010', '']),
        ('754-3010ext 42', ['1', 'USA/Canada', '', '754-3010', '42']),
        ('754-3010x 42', ['1', 'USA/Canada', '', '754-3010', '42']),
        ('754-3010 ext 42', ['1', 'USA/Canada', '', '754-3010', '42']),
        ('754-3010 ext. 42', ['1', 'USA/Canada', '', '754-3010', '42']),
        ('754-3010 x. 42', ['1', 'USA/Canada', '', '754-3010', '42']),
        ('754-3010 x42', ['1', 'USA/Canada', '', '754-3010', '42']),
        ('(541) 754-3010', ['1', 'USA/Canada', '541', '754-3010', '']),
        ('+1-541-754-3010', ['1', 'USA/Canada', '541', '754-3010', '']),
        ('191 541 754 3010', ['', '', '', '915417543010', '']),
        ('001-541-754-3010', ['1', 'USA/Canada', '541', '754-3010', '']),
        ('636-48018', ['61', 'Australia', '', '6364-8018', '']),
        ('(089) / 636-48018', ['1', 'USA/Canada', '896', '364-8018', '']),
        ('+49-89-636-48018', ['49', 'Germany', '', '89 636 48018', '']),
        ('19-49-89-636-48018', ['', '', '', '9498963648018', '']),
        ('+61 (02) 6125 0101', ['61', 'Australia', '02', '6125-0101', '']),
        ('++61 (02) 6125 0101', ['61', 'Australia', '02', '6125-0101', '']),
        ('++61 (2) 6125 0101', ['61', 'Australia', '02', '6125-0101', '']),
        ('11 +61 (2) 6125 0101', ['', '', '', '161261250101', '']),
        ('0011 ++61 (2) 6125 0101',
         ['61', 'Australia', '02', '6125-0101', '']),
        ('0111 ++61 (2) 6125 0101',
         ['61', 'Australia', '02', '6125-0101', '']),
        ('0111 61 02 6125 0101', ['61', 'Australia', '02', '6125-0101', '']),
        ('61 (2) 6125 0101', ['61', 'Australia', '02', '6125-0101', '']),
    ]

    # Names with given names first: (input string, six-element expected list)
    #
    self.names_gnames = [
        ('', blank()),
        ('Peter Christen', ['male', '', 'peter', '', 'christen', '']),
        ('"DR" Peter Christen', ['male', 'dr', 'peter', '', 'christen', '']),
        ('<mr> Peter Christen', ['male', 'mr', 'peter', '', 'christen', '']),
        ('{ Dr > Peter Christen', ['male', 'dr', 'peter', '', 'christen', '']),
        (' " Dr Peter Christen', ['male', 'dr', 'peter', '', 'christen', '']),
        ('Peter () Christen', ['male', 'dr', 'peter', '', 'christen', '']),
        ('Peter Christen(DR]]', ['male', 'dr', 'peter', '', 'christen', '']),
        ('Peter Christen (mister',
         ['male', 'mr', 'peter', '', 'christen', '']),
        ('Peter Christen " mr', ['male', 'mr', 'peter', '', 'christen', '']),
        ('Peter Christen {mr } ', ['male', 'mr', 'peter', '', 'christen', '']),
        ('Peter Christen "dr"', ['male', 'dr', 'peter', '', 'christen', '']),
        (' ( ) Peter Christen', ['male', '', 'peter', '', 'christen', '']),
        ('Peter " " Christen', ['male', '', 'peter', '', 'christen', '']),
        ('Peter (> Christen', ['male', '', 'peter', '', 'christen', '']),
        (',Peter Christen--', ['male', '', 'peter', '', 'christen', '']),
        ('-,- Peter Christen-,-', ['male', '', 'peter', '', 'christen', '']),
        (' //  Peter Christen//', ['male', '', 'peter', '', 'christen', '']),
        ('(Peter,Christen  )  ', ['male', '', 'peter', '', 'christen', '']),
        ('[Peter   Christen]', ['male', '', 'peter', '', 'christen', '']),
        ('<<Peter ,  Christen>>', ['male', '', 'peter', '', 'christen', '']),
        ('{  Peter Christen }', ['male', '', 'peter', '', 'christen', '']),
        ('"Peter Christen"', ['male', '', 'peter', '', 'christen', '']),
        ("''Peter ; Christen''", ['male', '', 'peter', '', 'christen', '']),
        ("'|Peter ?: Christen'|", ['male', '', 'peter', '', 'christen', '']),
        ('Mr peter Christen', ['male', 'mr', 'peter', '', 'christen', '']),
        ('Mister Peter CHRISTEN', ['male', 'mr', 'peter', '', 'christen', '']),
        ('Petra~ Christen', ['female', '', 'petra', '', 'christen', '']),
        ('Ms petra Christen', ['female', 'ms', 'petra', '', 'christen', '']),
        ('Misses Petra CHRISTEN',
         ['female', 'ms', 'petra', '', 'christen', '']),
        ('Peter Marco Jones', ['male', '', 'peter', 'mark', 'jones', '']),
        ('peter almond', ['male', '', 'peter', '', 'almond', '']),
        ('almond peter', ['male', '', 'peter', '', 'almond', '']),
        ('Peter', ['male', '', 'peter', '', '', '']),
        ('alison de francesco', blank()),
        ('alison de-francesco', blank()),
        ('peter de la placa', blank()),
        ('peter marco de la placa', blank()),
        ('maria petra de la placa-miller', blank()),
        ('maria petra vonder felde', blank()),
        ('Christen', ['', '', '', '', 'christen', '']),
        ('Jane', ['female', '', 'jane', '', '', '']),
        ('miss anita', ['female', 'ms', 'anita', '', '', '']),
        ('mr p. christen', ['male', 'mr', 'p', '', 'christen', '']),
        ('Peter mary jones', ['', '', 'peter', 'mary', 'jones', '']),
        ('mr Peter mary jones', ['male', 'mr', 'peter', 'mary', 'jones', '']),
        ('mister Paul PETER jones-miller',
         ['male', 'mr', 'paul', 'peter', 'jones', 'miller']),
        ('peter known as pete', ['male', '', 'peter', 'peter', '', '']),
        ('nee miller', ['', '', 'nee', '', 'miller', '']),
        ('peter de nee', ['', '', 'peter', '', 'de nee', '']),
        ('paul saint nee', ['', '', 'paul', '', 'saint nee', '']),
        ('saint paul nee', ['', '', 'saint paul', '', 'nee', '']),
        ('paula miller (nee jones)', blank()),
        ('peter, son of nee miller', blank()),
        ('peter (known as  pete) christen', blank()),
        ('peter (known as "pete") christen', blank()),
        ('peter christen miller', blank()),
        ('peter christen-miller', blank()),
        ('peter joe christen-miller', blank()),
        ("peter 'joe' christen-miller", blank()),
        ('"sharky" peter miller', blank()),
        ("'barbie' sue smith-jones", blank()),
        ('sue "barbie" smith meyer', blank()),
        ('sue known as "barbie" smith meyer', blank()),
        ("sue 'barbie' smith-jones", blank()),
        ("sue 'barbie' smith jones", blank()),
        ('sue baby of maria jones', blank()),

        ('jane co lo-schiavo', blank()),
        ('martina louis barber', blank()),
        ('lisa-anne hennessy', blank()),
        ('michelle southam-byrnes', blank()),
        ('nicole win jordan', blank()),
        ('caroline and clarke', blank()),
        ('jocelyn or buskens', blank()),
        ('yee fung nee cheng', blank()),
        ('jenny khaw nee yii', blank()),
        ('roslyn kay sta maria', blank()),
        ('shelley lee di stefano', blank()),
        ('li qing van huisstede', blank()),
        ('patricia ann van den hurk', blank()),
        ('kim maree nguyen su', blank()),
        ('adriana haile de lange', blank()),
        ("jodene akke op't land", blank()),
        ('cleo ann di blasio', blank()),
        ('debbie saphire st quintin', blank()),
        ('nehmat e el chaar', blank()),
        ('yan chen ping yang', blank()),
        ('sharon leoni van ant werpen', blank()),
        ('nicole maria de oliveira', blank()),
        ('sonia denni de arman', blank()),
        ('nicole dan de arman', blank()),
        ('johdy louise dal santo', blank()),
        ('tamara lou st. john-morton', blank()),
        ('mercy jacq john peter', blank()),
        ('carly evelyn de st germain', blank()),
        ('rachael jane van buuren', blank()),
        ('joanna lilli van ryswyk', blank()),
        ('melissa ma romijn-van stey', blank()),
        ('wong jing ling huang', blank()),
        ('julie  maree mackenzie - hun', blank()),
        ('joanne agnes righettli (dr)', ['', 'dr', '', '', '', '']),
        ('siu har ng (hung)', blank()),
        ('anne-maree lawrence-franks', blank()),
        ('mao-yao rong-fong', blank()),
        ('wai-fun wheeler-smith', blank()),
        ('lee-anne westerbrook-sim', blank()),
        ('kasey-lee so-chan', blank()),
        ('sherri-anne hilder-penningt', blank()),
        ('yoon-sun ahn-wu', blank()),
        ('ying-xia yu-guo', blank()),
        ('hee-jing hyde-page', blank()),
        ('mary-anne chung-kwon', blank()),
        ('marie-reine attallah-boulos', blank()),
        ('tracy-lea zanco-hinds', blank()),
        ('tracy-maria beardow-brooks', blank()),
        ('el-masri sheehan-hill', blank()),
        ('vicki-maree cheryle-anne', blank()),
        ('vicki-mare sheehan-anna', blank()),
        ('cindy-lou mckie-bailey', blank()),
        ('jo-ann bakoss-parson', blank()),
        ('wan-ching tsui-chan', blank()),
        ('sue-ellen bruechert-reich', blank()),
        ('anna-marie vearing-brown', blank()),
        ("lisa-jane o'connor", blank()),
        ("julie-anne o'malley", blank()),
        ("mary-jane o'doherty", blank()),
        ("jose-carol o'leary", blank()),
        ("rose-merrie o'kane", blank()),
        ("ymeka-emily o'neill", blank()),
    ]

    ## check field spill - have 2 input fields

    # Names with surnames first: (input string, six-element expected list)
    #
    self.names_snames = [
        ('Christen Peter', ['male', '', 'peter', '', 'christen', '']),
        ('Christen, Peter', ['male', '', 'peter', '', 'christen', '']),
        ('Mr Christen Peter', ['male', 'mr', 'peter', '', 'christen', '']),
        ('Mister CHRISTEN, Peter',
         ['male', 'mr', 'peter', '', 'christen', '']),
        ('Christen Petra', ['female', '', 'petra', '', 'christen', '']),
        ('Ms Christen, petra', ['female', 'ms', 'petra', '', 'christen', '']),
        ('Misses CHRISTEN, PETRA',
         ['female', 'ms', 'petra', '', 'christen', '']),
        ('peter almond', ['male', '', 'peter', '', 'almond', '']),
        ('almond peter', ['male', '', 'peter', '', 'almond', '']),
        ('', blank()),
        ('Peter', ['male', '', 'peter', '', '', '']),
        ('Christen', ['', '', '', '', 'christen', '']),
        ('Jane', ['female', '', 'jane', '', '', '']),
        ('miss anita', ['female', 'ms', 'anita', '', '', '']),
        ('mr p. christen', ['male', 'mr', 'p', '', 'christen', '']),
        ('jones, Peter mary', ['', '', 'peter', 'mary', 'jones', '']),
        ('mr jones Peter mary', ['male', 'mr', 'peter', 'mary', 'jones', '']),
        ('mister jones-miller, Paul PETER',
         ['male', 'mr', 'paul', 'peter', 'jones', 'miller']),
    ]

    self.name_male_titles = ['mr']
    self.name_female_titles = ['ms']

    # Directory prefix of the lookup files, including the trailing separator
    # (the empty last element makes the join end with os.sep).
    src_data_dir = os.sep.join(['..', 'data', 'lookup', ''])

    tag_table_files = ['givenname_f.tbl',
                       'givenname_m.tbl',
                       'name_misc.tbl',
                       'name_prefix.tbl',
                       'name_title.tbl',
                       'saints.tbl',
                       'surname.tbl']

    self.name_tag_table = lookup.TagLookupTable(descr='Name tag test table')
    self.name_tag_table.load([src_data_dir + tbl for tbl in tag_table_files])

    self.name_corr_list = lookup.CorrectionList(descr = 'Name corr test list')
    self.name_corr_list.load(src_data_dir + 'name_corr.lst')
예제 #3
0
    def setUp(self):

        self.in_ds = dataset.DataSetCSV(
            descri="A standardisation test data set",
            access_mode="read",
            rec_ident="rec_id",
            field_list=[],
            header_line=True,
            write_header=True,
            file_name="test-standard-dataset.csv",
        )

        self.out_ds = dataset.DataSetCSV(
            descrip="The standardised test data set",
            access_mode="write",
            rec_ident="rec_id",
            field_list=[
                ("day", 0),
                ("month", 1),
                ("year", 2),
                ("country_code", 3),
                ("country_name", 4),
                ("area_code", 5),
                ("number", 6),
                ("extension", 7),
                ("title", 8),
                ("gender_guess", 9),
                ("given_name", 10),
                ("alt_given_name", 11),
                ("surname", 12),
                ("alt_surname", 13),
                ("out_pass1", 14),
                ("out_pass2", 15),
            ],
            header_line=True,
            write_header=True,
            file_name="test-standardised-dataset.csv",
        )

        self.dates = [
            ["Sep 1, 68", ["1", "9", "1968"]],
            ["18 Jan 2002", ["18", "1", "2002"]],
            ["17:2:2002", ["17", "2", "2002"]],
            ["2002-02-25", ["25", "2", "2002"]],
            ["18,03,2001", ["18", "3", "2001"]],
            ["21.12.1999", ["21", "12", "1999"]],
            ["February 18,19", ["18", "2", "1919"]],
            ["23\\July\\1968", ["23", "7", "1968"]],
            ["18-02-2002", ["18", "2", "2002"]],
            ["5/03/01", ["5", "3", "2001"]],
            ["19680429", ["29", "4", "1968"]],
            ["600810", ["10", "8", "1960"]],
            ["3:05:2000", ["3", "5", "2000"]],
            ["30.11.1989", ["30", "11", "1989"]],
            ["1. January '70", ["1", "1", "1970"]],
            ["01011970", ["1", "1", "1970"]],
            ["10011970", ["10", "1", "1970"]],
            ["31 dec  1969", ["31", "12", "1969"]],
            ["30 december  69", ["30", "12", "1969"]],
            ["01011970", ["1", "1", "1970"]],
            ["13 Feb 1945", ["13", "2", "1945"]],
            ["Feb 13, '45", ["13", "2", "1945"]],
            ["April 29 1968", ["29", "4", "1968"]],
            ["29-4=68", ["29", "4", "1968"]],
            ["11-01-1972", ["11", "1", "1972"]],
            ["January 10. 1972", ["10", "1", "1972"]],
            ["29 Feb 1932", ["29", "2", "1932"]],
            ["29 Feb 32", ["29", "2", "1932"]],
            ["11 Jun 1902", ["11", "6", "1902"]],
            ["11 Jul 1989", ["11", "7", "1989"]],
            ["12111968", ["12", "11", "1968"]],
            ["      21111969  ", ["21", "11", "1969"]],
        ]

        self.date_parse_formats = [
            "%d %m %Y",  # 24 04 2002  or  24 4 2002
            "%d %B %Y",  # 24 April 2002
            "%d %b %Y",  # 24 Apr 2002
            "%m %d %Y",  # 04 24 2002  or  4 24 2002
            "%B %d %Y",  # April 24 2002
            "%b %d %Y",  # Apr 24 2002
            "%Y %m %d",  # 2002 04 24  or  2002 4 24
            "%Y %B %d",  # 2002 April 24
            "%Y %b %d",  # 2002 Apr 24
            "%d %m %y",  # 24 04 02    or  24 4 02
            "%d %B %y",  # 24 April 02
            "%d %b %y",  # 24 Apr 02
            "%y %m %d",  # 02 04 24    or  02 4 24
            "%y %B %d",  # 02 April 24
            "%y %b %d",  # 02 Apr 24
            "%m %d %y",  # 04 24 02    or  4 24 02
            "%B %d %y",  # April 24 02
            "%b %d %y",
        ]  # Apr 24 02

        self.phonenums = [
            ("++61 2 6125 5690", ["61", "Australia", "02", "6125-5690", ""]),
            ("0061 02 6125 5690", ["61", "Australia", "02", "6125-5690", ""]),
            ("0061   02    6125-5690",
             ["61", "Australia", "02", "6125-5690", ""]),
            ("41 312 17 84", ["41", "Switzerland", "", "312 17 84", ""]),
            ("6125 0010", ["61", "Australia", "", "6125-0010", ""]),
            ("1-800-764-0432", ["1", "USA/Canada", "800", "764-0432", ""]),
            ("02 6125 0010", ["61", "Australia", "02", "6125-0010", ""]),
            ("00 1 317-923 4523", ["1", "USA/Canada", "317", "923-4523", ""]),
            ("1 317-923 4523", ["1", "USA/Canada", "317", "923-4523", ""]),
            ("00111 41 312 17 84", ["41", "Switzerland", "", "312 17 84", ""]),
            ("00001 41 312 17 84", ["41", "Switzerland", "", "312 17 84", ""]),
            ("01 41 312 17 84", ["41", "Switzerland", "", "312 17 84", ""]),
            ("1-541-754-3010", ["1", "USA/Canada", "541", "754-3010", ""]),
            ("754-3010", ["1", "USA/Canada", "", "754-3010", ""]),
            ("754-3010ext 42", ["1", "USA/Canada", "", "754-3010", "42"]),
            ("754-3010x 42", ["1", "USA/Canada", "", "754-3010", "42"]),
            ("754-3010 ext 42", ["1", "USA/Canada", "", "754-3010", "42"]),
            ("754-3010 ext. 42", ["1", "USA/Canada", "", "754-3010", "42"]),
            ("754-3010 x. 42", ["1", "USA/Canada", "", "754-3010", "42"]),
            ("754-3010 x42", ["1", "USA/Canada", "", "754-3010", "42"]),
            ("(541) 754-3010", ["1", "USA/Canada", "541", "754-3010", ""]),
            ("+1-541-754-3010", ["1", "USA/Canada", "541", "754-3010", ""]),
            ("191 541 754 3010", ["", "", "", "915417543010", ""]),
            ("001-541-754-3010", ["1", "USA/Canada", "541", "754-3010", ""]),
            ("636-48018", ["61", "Australia", "", "6364-8018", ""]),
            ("(089) / 636-48018", ["1", "USA/Canada", "896", "364-8018", ""]),
            ("+49-89-636-48018", ["49", "Germany", "", "89 636 48018", ""]),
            ("19-49-89-636-48018", ["", "", "", "9498963648018", ""]),
            ("+61 (02) 6125 0101", ["61", "Australia", "02", "6125-0101", ""]),
            ("++61 (02) 6125 0101", ["61", "Australia", "02", "6125-0101",
                                     ""]),
            ("++61 (2) 6125 0101", ["61", "Australia", "02", "6125-0101", ""]),
            ("11 +61 (2) 6125 0101", ["", "", "", "161261250101", ""]),
            ("0011 ++61 (2) 6125 0101",
             ["61", "Australia", "02", "6125-0101", ""]),
            ("0111 ++61 (2) 6125 0101",
             ["61", "Australia", "02", "6125-0101", ""]),
            ("0111 61 02 6125 0101",
             ["61", "Australia", "02", "6125-0101", ""]),
            ("61 (2) 6125 0101", ["61", "Australia", "02", "6125-0101", ""]),
        ]

        # Names with given names first
        #
        self.names_gnames = [
            ("", ["", "", "", "", "", ""]),
            ("Peter Christen", ["male", "", "peter", "", "christen", ""]),
            ('"DR" Peter Christen',
             ["male", "dr", "peter", "", "christen", ""]),
            ("<mr> Peter Christen",
             ["male", "mr", "peter", "", "christen", ""]),
            ("{ Dr > Peter Christen",
             ["male", "dr", "peter", "", "christen", ""]),
            (' " Dr Peter Christen',
             ["male", "dr", "peter", "", "christen", ""]),
            ("Peter () Christen", ["male", "dr", "peter", "", "christen", ""]),
            ("Peter Christen(DR]]",
             ["male", "dr", "peter", "", "christen", ""]),
            ("Peter Christen (mister",
             ["male", "mr", "peter", "", "christen", ""]),
            ('Peter Christen " mr',
             ["male", "mr", "peter", "", "christen", ""]),
            ("Peter Christen {mr } ",
             ["male", "mr", "peter", "", "christen", ""]),
            ('Peter Christen "dr"',
             ["male", "dr", "peter", "", "christen", ""]),
            (" ( ) Peter Christen", ["male", "", "peter", "", "christen", ""]),
            ('Peter " " Christen', ["male", "", "peter", "", "christen", ""]),
            ("Peter (> Christen", ["male", "", "peter", "", "christen", ""]),
            (",Peter Christen--", ["male", "", "peter", "", "christen", ""]),
            ("-,- Peter Christen-,-",
             ["male", "", "peter", "", "christen", ""]),
            (" //  Peter Christen//",
             ["male", "", "peter", "", "christen", ""]),
            ("(Peter,Christen  )  ", ["male", "", "peter", "", "christen",
                                      ""]),
            ("[Peter   Christen]", ["male", "", "peter", "", "christen", ""]),
            ("<<Peter ,  Christen>>",
             ["male", "", "peter", "", "christen", ""]),
            ("{  Peter Christen }", ["male", "", "peter", "", "christen", ""]),
            ('"Peter Christen"', ["male", "", "peter", "", "christen", ""]),
            ("''Peter ; Christen''", ["male", "", "peter", "", "christen",
                                      ""]),
            ("'|Peter ?: Christen'|",
             ["male", "", "peter", "", "christen", ""]),
            ("Mr peter Christen", ["male", "mr", "peter", "", "christen", ""]),
            ("Mister Peter CHRISTEN",
             ["male", "mr", "peter", "", "christen", ""]),
            ("Petra~ Christen", ["female", "", "petra", "", "christen", ""]),
            ("Ms petra Christen",
             ["female", "ms", "petra", "", "christen", ""]),
            ("Misses Petra CHRISTEN",
             ["female", "ms", "petra", "", "christen", ""]),
            ("Peter Marco Jones", ["male", "", "peter", "mark", "jones", ""]),
            ("peter almond", ["male", "", "peter", "", "almond", ""]),
            ("almond peter", ["male", "", "peter", "", "almond", ""]),
            ("Peter", ["male", "", "peter", "", "", ""]),
            ("alison de francesco", ["", "", "", "", "", ""]),
            ("alison de-francesco", ["", "", "", "", "", ""]),
            ("peter de la placa", ["", "", "", "", "", ""]),
            ("peter marco de la placa", ["", "", "", "", "", ""]),
            ("maria petra de la placa-miller", ["", "", "", "", "", ""]),
            ("maria petra vonder felde", ["", "", "", "", "", ""]),
            ("Christen", ["", "", "", "", "christen", ""]),
            ("Jane", ["female", "", "jane", "", "", ""]),
            ("miss anita", ["female", "ms", "anita", "", "", ""]),
            ("mr p. christen", ["male", "mr", "p", "", "christen", ""]),
            ("Peter mary jones", ["", "", "peter", "mary", "jones", ""]),
            ("mr Peter mary jones",
             ["male", "mr", "peter", "mary", "jones", ""]),
            (
                "mister Paul PETER jones-miller",
                ["male", "mr", "paul", "peter", "jones", "miller"],
            ),
            ("peter known as pete", ["male", "", "peter", "peter", "", ""]),
            ("nee miller", ["", "", "nee", "", "miller", ""]),
            ("peter de nee", ["", "", "peter", "", "de nee", ""]),
            ("paul saint nee", ["", "", "paul", "", "saint nee", ""]),
            ("saint paul nee", ["", "", "saint paul", "", "nee", ""]),
            ("paula miller (nee jones)", ["", "", "", "", "", ""]),
            ("peter, son of nee miller", ["", "", "", "", "", ""]),
            ("peter (known as  pete) christen", ["", "", "", "", "", ""]),
            ('peter (known as "pete") christen', ["", "", "", "", "", ""]),
            ("peter christen miller", ["", "", "", "", "", ""]),
            ("peter christen-miller", ["", "", "", "", "", ""]),
            ("peter joe christen-miller", ["", "", "", "", "", ""]),
            ("peter 'joe' christen-miller", ["", "", "", "", "", ""]),
            ('"sharky" peter miller', ["", "", "", "", "", ""]),
            ("'barbie' sue smith-jones", ["", "", "", "", "", ""]),
            ('sue "barbie" smith meyer', ["", "", "", "", "", ""]),
            ('sue known as "barbie" smith meyer', ["", "", "", "", "", ""]),
            ("sue 'barbie' smith-jones", ["", "", "", "", "", ""]),
            ("sue 'barbie' smith jones", ["", "", "", "", "", ""]),
            ("sue baby of maria jones", ["", "", "", "", "", ""]),
            ("jane co lo-schiavo", ["", "", "", "", "", ""]),
            ("martina louis barber", ["", "", "", "", "", ""]),
            ("lisa-anne hennessy", ["", "", "", "", "", ""]),
            ("michelle southam-byrnes", ["", "", "", "", "", ""]),
            ("nicole win jordan", ["", "", "", "", "", ""]),
            ("caroline and clarke", ["", "", "", "", "", ""]),
            ("jocelyn or buskens", ["", "", "", "", "", ""]),
            ("yee fung nee cheng", ["", "", "", "", "", ""]),
            ("jenny khaw nee yii", ["", "", "", "", "", ""]),
            ("roslyn kay sta maria", ["", "", "", "", "", ""]),
            ("shelley lee di stefano", ["", "", "", "", "", ""]),
            ("li qing van huisstede", ["", "", "", "", "", ""]),
            ("patricia ann van den hurk", ["", "", "", "", "", ""]),
            ("kim maree nguyen su", ["", "", "", "", "", ""]),
            ("adriana haile de lange", ["", "", "", "", "", ""]),
            ("jodene akke op't land", ["", "", "", "", "", ""]),
            ("cleo ann di blasio", ["", "", "", "", "", ""]),
            ("debbie saphire st quintin", ["", "", "", "", "", ""]),
            ("nehmat e el chaar", ["", "", "", "", "", ""]),
            ("yan chen ping yang", ["", "", "", "", "", ""]),
            ("sharon leoni van ant werpen", ["", "", "", "", "", ""]),
            ("nicole maria de oliveira", ["", "", "", "", "", ""]),
            ("sonia denni de arman", ["", "", "", "", "", ""]),
            ("nicole dan de arman", ["", "", "", "", "", ""]),
            ("johdy louise dal santo", ["", "", "", "", "", ""]),
            ("tamara lou st. john-morton", ["", "", "", "", "", ""]),
            ("mercy jacq john peter", ["", "", "", "", "", ""]),
            ("carly evelyn de st germain", ["", "", "", "", "", ""]),
            ("rachael jane van buuren", ["", "", "", "", "", ""]),
            ("joanna lilli van ryswyk", ["", "", "", "", "", ""]),
            ("melissa ma romijn-van stey", ["", "", "", "", "", ""]),
            ("wong jing ling huang", ["", "", "", "", "", ""]),
            ("julie  maree mackenzie - hun", ["", "", "", "", "", ""]),
            ("joanne agnes righettli (dr)", ["", "dr", "", "", "", ""]),
            ("siu har ng (hung)", ["", "", "", "", "", ""]),
            ("anne-maree lawrence-franks", ["", "", "", "", "", ""]),
            ("mao-yao rong-fong", ["", "", "", "", "", ""]),
            ("wai-fun wheeler-smith", ["", "", "", "", "", ""]),
            ("lee-anne westerbrook-sim", ["", "", "", "", "", ""]),
            ("kasey-lee so-chan", ["", "", "", "", "", ""]),
            ("sherri-anne hilder-penningt", ["", "", "", "", "", ""]),
            ("yoon-sun ahn-wu", ["", "", "", "", "", ""]),
            ("ying-xia yu-guo", ["", "", "", "", "", ""]),
            ("hee-jing hyde-page", ["", "", "", "", "", ""]),
            ("mary-anne chung-kwon", ["", "", "", "", "", ""]),
            ("marie-reine attallah-boulos", ["", "", "", "", "", ""]),
            ("tracy-lea zanco-hinds", ["", "", "", "", "", ""]),
            ("tracy-maria beardow-brooks", ["", "", "", "", "", ""]),
            ("el-masri sheehan-hill", ["", "", "", "", "", ""]),
            ("vicki-maree cheryle-anne", ["", "", "", "", "", ""]),
            ("vicki-mare sheehan-anna", ["", "", "", "", "", ""]),
            ("cindy-lou mckie-bailey", ["", "", "", "", "", ""]),
            ("jo-ann bakoss-parson", ["", "", "", "", "", ""]),
            ("wan-ching tsui-chan", ["", "", "", "", "", ""]),
            ("sue-ellen bruechert-reich", ["", "", "", "", "", ""]),
            ("anna-marie vearing-brown", ["", "", "", "", "", ""]),
            ("lisa-jane o'connor", ["", "", "", "", "", ""]),
            ("julie-anne o'malley", ["", "", "", "", "", ""]),
            ("mary-jane o'doherty", ["", "", "", "", "", ""]),
            ("jose-carol o'leary", ["", "", "", "", "", ""]),
            ("rose-merrie o'kane", ["", "", "", "", "", ""]),
            ("ymeka-emily o'neill", ["", "", "", "", "", ""]),
        ]

        # TODO: check field spill - this needs two input fields

        # Names with surnames first
        #
        self.names_snames = [
            ("Christen Peter", ["male", "", "peter", "", "christen", ""]),
            ("Christen, Peter", ["male", "", "peter", "", "christen", ""]),
            ("Mr Christen Peter", ["male", "mr", "peter", "", "christen", ""]),
            ("Mister CHRISTEN, Peter",
             ["male", "mr", "peter", "", "christen", ""]),
            ("Christen Petra", ["female", "", "petra", "", "christen", ""]),
            ("Ms Christen, petra",
             ["female", "ms", "petra", "", "christen", ""]),
            ("Misses CHRISTEN, PETRA",
             ["female", "ms", "petra", "", "christen", ""]),
            ("peter almond", ["male", "", "peter", "", "almond", ""]),
            ("almond peter", ["male", "", "peter", "", "almond", ""]),
            ("", ["", "", "", "", "", ""]),
            ("Peter", ["male", "", "peter", "", "", ""]),
            ("Christen", ["", "", "", "", "christen", ""]),
            ("Jane", ["female", "", "jane", "", "", ""]),
            ("miss anita", ["female", "ms", "anita", "", "", ""]),
            ("mr p. christen", ["male", "mr", "p", "", "christen", ""]),
            ("jones, Peter mary", ["", "", "peter", "mary", "jones", ""]),
            ("mr jones Peter mary",
             ["male", "mr", "peter", "mary", "jones", ""]),
            (
                "mister jones-miller, Paul PETER",
                ["male", "mr", "paul", "peter", "jones", "miller"],
            ),
        ]

        self.name_male_titles = ["mr"]
        self.name_female_titles = ["ms"]

        src_data_dir = ".." + os.sep + "data" + os.sep + "lookup" + os.sep

        self.name_tag_table = lookup.TagLookupTable(
            descr="Name tag test table")
        self.name_tag_table.load([
            src_data_dir + "givenname_f.tbl",
            src_data_dir + "givenname_m.tbl",
            src_data_dir + "name_misc.tbl",
            src_data_dir + "name_prefix.tbl",
            src_data_dir + "name_title.tbl",
            src_data_dir + "saints.tbl",
            src_data_dir + "surname.tbl",
        ])

        self.name_corr_list = lookup.CorrectionList(
            descr="Name corr test list")
        self.name_corr_list.load(src_data_dir + "name_corr.lst")