def test_init(self):
    """Build names from keyword constituents and read the parts back."""
    # Only an intraposition and a family name are given, next to the full
    # name itself.
    sparse = Name(
        prepositie=None,
        voornaam=None,
        intrapositie='van het',
        geslachtsnaam='Reve',
        postpositie=None,
        volledige_naam='Gerard van het Reve',
    )
    self.assertEqual(sparse.volledige_naam(), 'Gerard van het Reve')
    self.assertEqual(sparse.geslachtsnaam(), 'Reve')

    # Every constituent is given; each accessor must return its own part.
    complete = Name(
        prepositie='dhr.',
        voornaam='Gerard',
        intrapositie='van het',
        geslachtsnaam='Reve',
        postpositie='schrijver',
        volledige_naam='dhr. Gerard van het Reve, schrijver',
    )
    # NOTE(review): geslachtsnaam is checked twice (fourth and last entry);
    # the last check may have been meant for another accessor -- confirm.
    expected_parts = (
        ('prepositie', 'dhr.'),
        ('voornaam', 'Gerard'),
        ('intrapositie', 'van het'),
        ('geslachtsnaam', 'Reve'),
        ('postpositie', 'schrijver'),
        ('geslachtsnaam', 'Reve'),
    )
    for accessor, expected in expected_parts:
        self.assertEqual(getattr(complete, accessor)(), expected)
def test_html2unicode(self):
    """Check a non-ASCII name and the html2unicode helper."""
    raw = 'Mötörhead'
    name = Name(raw)
    self.assertEqual(name.volledige_naam(), 'Mötörhead')

    # TODO: the checks below exercise a utility function and should move
    # to a separate test for the utility functions in common.
    conversions = (
        ('é', 'é'),
        ('São', 'São'),
    )
    for source, expected in conversions:
        self.assertEqual(html2unicode(source), expected)
def test_html2unicode(self):
    """Check a non-ASCII unicode name and the html2unicode helper."""
    raw = u'Mötörhead'
    name = Name(raw)
    self.assertEqual(name.volledige_naam(), u'Mötörhead')

    # TODO: the checks below exercise a utility function and should move
    # to a separate test for the utility functions in common.
    conversions = (
        ('é', u'é'),
        ('São', u'São'),
    )
    for source, expected in conversions:
        self.assertEqual(html2unicode(source), expected)
def test_initials(self):
    """Check the initials derived from several name notations."""
    cases = (
        ('P. Gerbrandy', 'PG'),
        ('Engelmann, Th.W.', 'TWE'),
        ('Borret, Prof. Dr. Theodoor Joseph Hubert', 'TJHB'),
        ('Hoeven, Abraham des Amorie van der (1)', 'AAH'),
    )
    for raw, expected in cases:
        self.assertEqual(Name(raw).initials(), expected)
def test_serialize(self):
    """Check that serializing XML yields its text, optionally minus a part."""
    nested = etree.fromstring('<a>a<b>b</b> c</a>')  #@UndefinedVariable
    self.assertEqual(Name().serialize(nested), 'ab c')

    markup = '<persName>Jelle <name type="geslachtsnaam">Gerbrandy</name></persName>'
    naam = Name().from_string(markup)
    full_text = 'Jelle Gerbrandy'
    # The bare serialize function and the method agree on the root element,
    # and the method gives the same text when called without an element.
    self.assertEqual(serialize(naam._root), full_text)
    self.assertEqual(naam.serialize(naam._root), full_text)
    self.assertEqual(naam.serialize(), full_text)
    # Excluding the family name leaves only the remaining text.
    self.assertEqual(naam.serialize(exclude='geslachtsnaam'), 'Jelle')
def test_idempotence(self):
    """Calling the guessing functions repeatedly must not change the XML."""
    name = Name('Jelle Gerbrandy')
    guess_calls = (
        name.guess_normal_form,
        name.guess_normal_form,
        name.guess_geslachtsnaam,
    )
    # Take a snapshot of the serialized name after every guessing call.
    snapshots = []
    for guess in guess_calls:
        guess()
        snapshots.append(name.to_string())

    baseline, after_repeat, after_geslachtsnaam = snapshots
    self.assertEqual(baseline, after_repeat)
    self.assertEqual(baseline, after_geslachtsnaam)
def test_soundex_nl1(self):
    """Check soundex_nl1 / Name.soundex_nl codes for a range of spellings."""
    self.assertEqual(soundex_nl1('Scholten', length=5), 'sg.lt')

    # Four spellings that must all produce the same code.
    spellings = [Name(raw) for raw in ('Uyl', 'Uijl', 'Uil', 'Yl')]
    for spelling in spellings:
        self.assertEqual(spelling.soundex_nl(length=5), ['.l'])

    default_length_cases = (
        ('AAA', ['.']),
        ('Quade', ['k.t']),
        ('Quack', ['k.k']),
        ('kwak', ['k.k']),
        ('kwik en kwak', ['k.k', ]),
        ('rhood', ['r.t']),
        ('zutphen', ['s.tf']),
        ('Willem', ['f.l.']),
    )
    for raw, expected in default_length_cases:
        self.assertEqual(Name(raw).soundex_nl(), expected)

    # Diacritics?
    self.assertEqual(soundex_nl1(u'wél'), 'f.l')
    self.assertEqual(soundex_nl1(u'bosma'), soundex_nl1(u'boschma'))
    self.assertEqual(Name('Pius IX').soundex_nl(), ['p.s', ])

    # Within every group of SAME_SOUNDEX, each member must get the same
    # code as the first member of that group.
    for group in SAME_SOUNDEX:
        reference = group[0]
        reference_code = soundex_nl1(reference)
        for other in group[1:]:
            other_code = soundex_nl1(other)
            self.assertEqual(
                reference_code, other_code,
                '%s=>%s ::: %s=>%s' % (reference, reference_code, other, other_code))
def test_soundex_nl(self):
    """Check soundex_nl codes for names parsed from XML and plain strings."""
    markup = '<persName>Jelle <name type="geslachtsnaam">Gerbrandy</name></persName>'
    parsed = Name().from_string(markup)
    self.assertEqual(set(parsed.soundex_nl(length=5)), set(['g.rpr', 'j.l']))

    markup = '<persName>Jelle <name type="geslachtsnaam">Scholten</name></persName>'
    parsed = Name().from_string(markup)
    # This check compares the list itself, so the order of the codes matters.
    self.assertEqual(parsed.soundex_nl(length=5), ['sg.lt', 'j.l'])

    hendrik_codes = Name('janssen, hendrik').soundex_nl(group=1)
    self.assertEqual(set(hendrik_codes), set(['j.ns', '.tr.']))

    # A hyphen and a comma between the two family names give equal codes.
    hyphenated = Name('aearssen-walte, lucia van').soundex_nl(group=1)
    with_comma = Name('aearssen,walte, lucia van').soundex_nl(group=1)
    self.assertEqual(hyphenated, with_comma)

    # Disabled case: Name('Jhr. Mr. K').soundex_nl() == ['k']
    family_codes = Name('janssen, hendrik').soundex_geslachtsnaam()
    self.assertEqual(set(family_codes), set([u'j.ns']))
def test_volledige_naam(self):
    """Check get_volledige_naam() for names that only hold a given name."""
    jelle = Name(voornaam='Jelle')
    self.assertEqual(jelle.get_volledige_naam(), 'Jelle')
    # The full name must be the same after guessing the family name.
    jelle.guess_geslachtsnaam()
    self.assertEqual(jelle.get_volledige_naam(), 'Jelle')

    markup = """<persName> <name type="voornaam">Hendrik IV</name> </persName>"""
    from_xml = Name().from_string(markup)
    self.assertEqual(from_xml.get_volledige_naam(), 'Hendrik IV')

    from_kwargs = Name(voornaam='Hendrik IV')
    self.assertEqual(from_kwargs.get_volledige_naam(), u'Hendrik IV')
def test_soundex_nl(self):
    """Check soundex_nl codes for names parsed from XML and plain strings."""
    markup = '<persName>Jelle <name type="geslachtsnaam">Gerbrandy</name></persName>'
    parsed = Name().from_string(markup)
    self.assertEqual(set(parsed.soundex_nl(length=5)), {'g.rpr', 'j.l'})

    markup = '<persName>Jelle <name type="geslachtsnaam">Scholten</name></persName>'
    parsed = Name().from_string(markup)
    # This check compares the list itself, so the order of the codes matters.
    self.assertEqual(parsed.soundex_nl(length=5), ['sg.lt', 'j.l'])

    hendrik_codes = Name('janssen, hendrik').soundex_nl(group=1)
    self.assertEqual(set(hendrik_codes), {'j.ns', '.tr.'})

    # A hyphen and a comma between the two family names give equal codes.
    hyphenated = Name('aearssen-walte, lucia van').soundex_nl(group=1)
    with_comma = Name('aearssen,walte, lucia van').soundex_nl(group=1)
    self.assertEqual(hyphenated, with_comma)

    # Disabled case: Name('Jhr. Mr. K').soundex_nl() == ['k']
    family_codes = Name('janssen, hendrik').soundex_geslachtsnaam()
    self.assertEqual(set(family_codes), {'j.ns'})
def test_insert_consituent(self):
    """Check that _insert_constituent tags the matched text with a type."""
    partly_tagged = '<persName>Hugo <name type="intrapositie">de</name> <name type="geslachtsnaam">Groot</name></persName>'
    fully_tagged = '<persName><name type="voornaam">Hugo</name> <name type="intrapositie">de</name> <name type="geslachtsnaam">Groot</name></persName>'
    untagged = '<persName>Hugo de Groot</persName>'
    intraposition_tagged = '<persName>Hugo <name type="intrapositie">de</name> Groot</persName>'

    name = Name().from_string(partly_tagged)
    leading_text = name.to_xml().text
    # Sanity check on the text the match below is taken from.
    self.assertEqual(leading_text, 'Hugo ')
    name._insert_constituent('voornaam', re.match('Hugo', leading_text))
    self.assertEqual(name.to_string(), fully_tagged)

    name = Name().from_string(untagged)
    whole_text = name.to_xml().text
    name._insert_constituent('intrapositie', re.search('de', whole_text))
    self.assertEqual(name.to_string(), intraposition_tagged)
def test_guess_normal_form(self):
    """Check guess_normal_form() against a table of names and expected forms."""
    self.assertEqual(Name('Arien A').guess_normal_form(), 'A, Arien')

    # (name, expected normal form); commented entries are disabled cases.
    cases = [
        (Name().from_args(geslachtsnaam='A', volledige_naam='Arien A'), 'A, Arien'),
        (Name('Brugse Meester van 1493'), 'Brugse Meester van 1493'),
        (Name('Th.W. Engelmann'), 'Engelmann, Th.W.'),
        (Name('A. Algra'), 'Algra, A.'),
        # (Name().from_string('<persName>A. Algra</persName>'), 'Algra A.')
        (Name('(G. Morton)'), 'Morton, G.'),
        (Name('Di(e)ck, Jan Gerard'), 'Dick, Jan Gerard'),
        (Name('Arien A'), 'A, Arien'),
        (Name('David Heilbron Cz.'), 'Heilbron Cz., David'),
        (Name('Johann (Johan) VII'), 'Johann VII'),
        (Name('Johann VII'), 'Johann VII'),
        # (Name('koning Willem III'), 'Willem III'),
        (Name(u'Crato, graaf van Nassau-Saarbrück'),
         u'Crato, graaf van Nassau-Saarbrück'),
        (Name(u'Wilhelmina van Pruisen - prinses van Oranje-Nassau'),
         'Wilhelmina van Pruisen - prinses van Oranje-Nassau'),
        (Name(u'Henriette Adriana Louise Flora d\'Oultremont de Wégimont'),
         u"d'Oultremont de Wégimont, Henriette Adriana Louise Flora"),
        (Name(u'Wolrat, vorst van Nassau-Usingen dikwijls genoemd Nassau-Saarbrück'),
         u'Wolrat, vorst van Nassau-Usingen dikwijls genoemd Nassau-Saarbrück'),
        (Name(u'van \'s-Gravezande, Arnoldus Corneluszn. Storm'),
         's-Gravezande, Arnoldus Corneluszn. Storm, van'),
        (Name('L.T. graaf van Nassau La Lecq'), 'L.T. graaf van Nassau La Lecq'),
        (Name(u'Géo d\'Aconit'), u'd\'Aconit, Géo'),
        (Name(u'J. Heemskerk Azn.'), u'Heemskerk Azn., J.'),
    ]
    for candidate, wanted in cases:
        self.assertEqual(candidate.guess_normal_form(), wanted)

    self.assertEqual(Name('A').guess_normal_form(), 'A')
    self.assertEqual(Name('Hendrik ten Brink Hz.').guess_normal_form(),
                     'Brink Hz., Hendrik ten')

    # Names built from parsed XML elements, in both word orders.
    element = etree.fromstring('<persName>Kees van Dongen</persName>')  #@UndefinedVariable
    kees = Name().from_xml(element)
    self.assertEqual(kees.guess_geslachtsnaam(), 'Dongen')
    self.assertEqual(kees.guess_normal_form(), 'Dongen, Kees van')

    element = etree.fromstring('<persName>Dongen, Kees van</persName>')  #@UndefinedVariable
    kees = Name().from_xml(element)
    self.assertEqual(kees.guess_normal_form(), 'Dongen, Kees van')
def test_from_args(self):
    """Check the XML produced when names are built from keyword arguments."""
    jelle_xml = '<persName>Jelle <name type="geslachtsnaam">Gerbrandy</name></persName>'
    # The same construction is checked twice in a row.
    for _ in range(2):
        built = Name().from_args(volledige_naam='Jelle Gerbrandy', geslachtsnaam='Gerbrandy')
        self.assertEqual(built.to_string(), jelle_xml)

    # Constituents passed straight to the constructor are each tagged.
    built = Name(geslachtsnaam='Gerbrandy', voornaam='Jelle', intrapositie=None)
    tagged_xml = '<persName><name type="voornaam">Jelle</name> <name type="geslachtsnaam">Gerbrandy</name></persName>'
    self.assertEqual(built.to_string(), tagged_xml)

    # A one-letter family name.
    built = Name().from_args(volledige_naam='Arien A', geslachtsnaam='A')
    arien_xml = '<persName>Arien <name type="geslachtsnaam">A</name></persName>'
    self.assertEqual(built.to_string(), arien_xml)
def test_equal(self):
    """Check that ratio() is 1.0 for different notations of the same name."""
    first = Name('Kees van Dongen')
    second = Name('Dongen, Kees van')
    self.assertEqual(ratio(first, second), 1.0)

    first = Name('Mercier, Camier')
    second = Name('Camier Mercier')
    self.assertEqual(ratio(first, second), 1.0)

    # The same comparison for names built from parsed XML elements.
    natural = Name().from_xml(
        etree.fromstring('<persName>Kees van Dongen</persName>'))  #@UndefinedVariable
    inverted = Name().from_xml(
        etree.fromstring('<persName>Dongen, Kees van</persName>'))  #@UndefinedVariable
    self.assertEqual(natural.guess_normal_form(), inverted.guess_normal_form())
    self.assertEqual(ratio(natural, inverted), 1.0)

    # A third name, built with store_guessed_geslachtsnaam switched off.
    unstored = Name().from_xml(
        etree.fromstring('<persName>Kees van Dongen</persName>'),  #@UndefinedVariable
        store_guessed_geslachtsnaam=False)
    self.assertEqual(ratio(natural, unstored), 1.0)
    self.assertEqual(ratio(inverted, unstored), 1.0)

    first = Name('Witte van Citters, Jacob de (jhr. mr.)')
    second = Name('Jacob de Witte van Citters')
    self.assertEqual(ratio(first, second), 1.0)
def test_serialize(self):
    """Check that serializing XML yields its text, optionally minus a part."""
    nested = etree.fromstring('<a>a<b>b</b> c</a>')  #@UndefinedVariable
    self.assertEqual(Name().serialize(nested), 'ab c')

    markup = '<persName>Jelle <name type="geslachtsnaam">Gerbrandy</name></persName>'
    naam = Name().from_string(markup)
    full_text = 'Jelle Gerbrandy'
    # The bare serialize function and the method agree on the root element,
    # and the method gives the same text when called without an element.
    self.assertEqual(serialize(naam._root), full_text)
    self.assertEqual(naam.serialize(naam._root), full_text)
    self.assertEqual(naam.serialize(), full_text)
    # Excluding the family name leaves only the remaining text.
    self.assertEqual(naam.serialize(exclude='geslachtsnaam'), 'Jelle')
def test_guess_geslachtsnaam(self):
    """Check the guessed family name for a table of plain-text names."""
    # (input, expected family name); commented entries are disabled cases.
    cases = (
        ('Jelle Gerbrandy', 'Gerbrandy'),
        # ('Boudewijn (zie van der AA.)', 'Boudewijn'),
        ('Gerbrandy, Jelle', 'Gerbrandy'),
        ('C.H.Veenstra', 'Veenstra'),
        ('Yvette Marcus-de Groot', 'Marcus-de Groot'),
        ('S. de Groot', 'Groot'),
        ('Willy Smit-Buit', 'Smit-Buit'),
        ('Hendrik', 'Hendrik'),
        ('Bec(q)-Crespin, Josina du', 'Bec(q)-Crespin'),
        ('David Heilbron Cz.', 'Heilbron'),
        ('Arien A', 'A'),
        ('Johannes de Heer', 'Heer'),
        ('Bonnet-Broederhart. A.G.', 'Bonnet-Broederhart.'),
        ('Th.W. Engelmann', 'Engelmann'),
        ('A Algra', 'Algra'),
        # ('Auger O', 'Auger'),
    )
    for raw, wanted in cases:
        self.assertEqual(Name(raw).guess_geslachtsnaam(), wanted)
def test_guess_constituents(self):
    """Check the XML from _guess_constituents() for several name shapes."""

    def guessed_xml(name):
        # Serialize whatever _guess_constituents() returns for this name.
        return etree.tostring(name._guess_constituents())  #@UndefinedVariable

    # Name of the form "family_name, given_name".
    expected = '<persName><name type="geslachtsnaam">Beckett</name>, <name type="voornaam">Samuel</name></persName>'
    self.assertEqual(guessed_xml(Name('Beckett, Samuel')), expected)
    # Round trip: the tagged XML is fed back in.
    self.assertEqual(guessed_xml(Name().from_string(expected)), expected)

    # A simple name in natural order.
    expected = '<persName><name type="voornaam">Samuel</name> <name type="geslachtsnaam">Beckett</name></persName>'
    self.assertEqual(guessed_xml(Name('Samuel Beckett')), expected)
    # Round trip: the tagged XML is fed back in.
    self.assertEqual(guessed_xml(Name().from_string(expected)), expected)

    # Intrapositions, a hyphenated double name and a one-letter family name.
    plain_cases = (
        ('Hugo de Groot',
         '<persName><name type="voornaam">Hugo</name> <name type="intrapositie">de</name> <name type="geslachtsnaam">Groot</name></persName>'),
        ('Marie Bakker-de Groot',
         '<persName><name type="voornaam">Marie</name> <name type="geslachtsnaam">Bakker-de Groot</name></persName>'),
        ('Arien A',
         '<persName><name type="voornaam">Arien</name> <name type="geslachtsnaam">A</name></persName>'),
    )
    for raw, wanted in plain_cases:
        self.assertEqual(guessed_xml(Name(raw)), wanted)
def test_html_codes(self):
    """Check the full name after calling html2unicode() on a Name."""
    accented = Name('Wél?')
    accented.html2unicode()
    full_name = accented.volledige_naam()
    self.assertEqual(full_name, 'Wél?')
def test_normal_form(self):
    """Check guess_normal_form() and guess_normal_form2() for many names.

    guess_normal_form() is expected to give the "family name, rest" form
    and guess_normal_form2() the natural reading order, as the expected
    values below show.
    """
    s = '<persName>Jelle <name type="geslachtsnaam">Gerbrandy</name></persName>'
    naam = Name().from_string(s)
    self.assertEqual(naam.geslachtsnaam(), u'Gerbrandy')
    self.assertEqual(naam.guess_normal_form(), u'Gerbrandy, Jelle')
    self.assertEqual(naam.guess_normal_form2(), u'Jelle Gerbrandy')

    naam = Name('Jelle Gerbrandy')
    self.assertEqual(naam.guess_normal_form(), u'Gerbrandy, Jelle')
    naam.guess_geslachtsnaam()
    self.assertEqual(naam.guess_normal_form2(), u'Jelle Gerbrandy')

    naam = Name('Gerbrandy, Jelle')
    self.assertEqual(naam.guess_normal_form(), u'Gerbrandy, Jelle')
    self.assertEqual(naam.guess_normal_form2(), u'Jelle Gerbrandy')

    # A name that only has a given name: no family name is expected.
    naam = Name(voornaam='Hendrik IV')
    self.assertEqual(naam.geslachtsnaam(), '')
    self.assertEqual(naam.guess_normal_form(), u'Hendrik IV')
    self.assertEqual(naam.guess_normal_form2(), u'Hendrik IV')

    n = Name().from_string("""<persName> <name type="voornaam">Hendrik IV</name> </persName>""")
    n.guess_geslachtsnaam()
    # A unittest assertion instead of a bare ``assert``: the latter is
    # stripped when Python runs with -O, silently disabling the check.
    self.assertFalse(n.geslachtsnaam(), n.to_string())
    self.assertEqual(n.guess_normal_form(), 'Hendrik IV')
    # NOTE(review): this checks ``naam`` (built from keyword arguments
    # above), not ``n`` -- possibly a copy/paste slip; confirm intent.
    self.assertEqual(naam.guess_normal_form2(), u'Hendrik IV')

    s = """<persName> <name type="geslachtsnaam">Xerxes</name> </persName>"""
    n = Name().from_string(s)
    self.assertEqual(n.guess_normal_form(), 'Xerxes')

    s = '<persName><name type="geslachtsnaam">A</name>, Arien</persName>'
    n = Name().from_string(s)
    self.assertEqual(n.guess_normal_form(), 'A, Arien')
    self.assertEqual(n.guess_normal_form2(), 'Arien A')

    n = Name('A.B.J.Teulings')
    self.assertEqual(n.guess_normal_form(), 'Teulings, A.B.J.')
    self.assertEqual(n.guess_normal_form2(), 'A.B.J.Teulings')

    naam = Name('JOHAN (Johann) VII')
    self.assertEqual(naam.guess_normal_form(), 'Johan VII')

    naam = Name().from_string('<persName><name type="geslachtsnaam">Dirk</name>, VI, Theodericus</persName>')
    self.assertEqual(naam.guess_normal_form(), 'Dirk, VI, Theodericus')

    naam = Name('Lodewijk XVIII')
    self.assertEqual(naam.guess_normal_form2(), 'Lodewijk XVIII')

    s = """<persName> <name type="voornaam">Trijn</name> <name type="intrapositie">van</name> <name type="geslachtsnaam">Leemput</name></persName>"""
    naam = Name().from_string(s)
    self.assertEqual(naam.guess_normal_form(), 'Leemput, Trijn van')
    self.assertEqual(naam.guess_normal_form2(), 'Trijn van Leemput')

    n5 = Name('Piet Gerbrandy', geslachtsnaam='Gerbrandy')
    self.assertEqual(n5.guess_normal_form(), 'Gerbrandy, Piet')
    self.assertEqual(n5.guess_normal_form2(), 'Piet Gerbrandy')

    # Disabled case (family name explicitly set to the first word):
    # n6 = Name('Piet Gerbrandy', geslachtsnaam='Piet')
    # n6._tokenize()
    # self.assertEqual(n6.guess_normal_form(), 'Piet Gerbrandy')
    # self.assertEqual(n6.guess_normal_form2(), 'Gerbrandy Piet')

    n = Name('Hermansz')
    self.assertEqual(n.guess_normal_form(), 'Hermansz')
    self.assertEqual(n.geslachtsnaam(), 'Hermansz')

    # Trailing parenthesised parts are absent from the expected forms.
    n = Name('Ada, van Holland (1)')
    self.assertEqual(n.guess_normal_form(), 'Ada, van Holland')

    n = Name('Hees - B.P. van')
    self.assertEqual(n.guess_normal_form(), 'Hees - B.P. van')

    n = Name('Hees - B.P. van (1234-1235)')
    self.assertEqual(n.guess_normal_form(), 'Hees - B.P. van')

    n = Name('Hoeven, Abraham des Amorie van der (1)')
    self.assertEqual(n.guess_normal_form(), 'Hoeven, Abraham des Amorie van der')
    self.assertEqual(n.guess_normal_form2(), 'Abraham des Amorie van der Hoeven')

    n = Name('Schepper, Gerhard Antoni IJssel de')
    self.assertEqual(n.guess_normal_form(), 'Schepper, Gerhard Antoni IJssel de')
def test_normal_form(self):
    """Check guess_normal_form() and guess_normal_form2() for many names.

    guess_normal_form() is expected to give the "family name, rest" form
    and guess_normal_form2() the natural reading order, as the expected
    values below show.
    """
    s = '<persName>Jelle <name type="geslachtsnaam">Gerbrandy</name></persName>'
    naam = Name().from_string(s)
    self.assertEqual(naam.geslachtsnaam(), 'Gerbrandy')
    self.assertEqual(naam.guess_normal_form(), 'Gerbrandy, Jelle')
    self.assertEqual(naam.guess_normal_form2(), 'Jelle Gerbrandy')

    naam = Name('Jelle Gerbrandy')
    self.assertEqual(naam.guess_normal_form(), 'Gerbrandy, Jelle')
    naam.guess_geslachtsnaam()
    self.assertEqual(naam.guess_normal_form2(), 'Jelle Gerbrandy')

    naam = Name('Gerbrandy, Jelle')
    self.assertEqual(naam.guess_normal_form(), 'Gerbrandy, Jelle')
    self.assertEqual(naam.guess_normal_form2(), 'Jelle Gerbrandy')

    # A name that only has a given name: no family name is expected.
    naam = Name(voornaam='Hendrik IV')
    self.assertEqual(naam.geslachtsnaam(), '')
    self.assertEqual(naam.guess_normal_form(), 'Hendrik IV')
    self.assertEqual(naam.guess_normal_form2(), 'Hendrik IV')

    n = Name().from_string("""<persName> <name type="voornaam">Hendrik IV</name> </persName>""")
    n.guess_geslachtsnaam()
    # A unittest assertion instead of a bare ``assert``: the latter is
    # stripped when Python runs with -O, silently disabling the check.
    self.assertFalse(n.geslachtsnaam(), n.to_string())
    self.assertEqual(n.guess_normal_form(), 'Hendrik IV')
    # NOTE(review): this checks ``naam`` (built from keyword arguments
    # above), not ``n`` -- possibly a copy/paste slip; confirm intent.
    self.assertEqual(naam.guess_normal_form2(), 'Hendrik IV')

    s = """<persName> <name type="geslachtsnaam">Xerxes</name> </persName>"""
    n = Name().from_string(s)
    self.assertEqual(n.guess_normal_form(), 'Xerxes')

    s = '<persName><name type="geslachtsnaam">A</name>, Arien</persName>'
    n = Name().from_string(s)
    self.assertEqual(n.guess_normal_form(), 'A, Arien')
    self.assertEqual(n.guess_normal_form2(), 'Arien A')

    n = Name('A.B.J.Teulings')
    self.assertEqual(n.guess_normal_form(), 'Teulings, A.B.J.')
    self.assertEqual(n.guess_normal_form2(), 'A.B.J.Teulings')

    naam = Name('JOHAN (Johann) VII')
    self.assertEqual(naam.guess_normal_form(), 'Johan VII')

    naam = Name().from_string(
        '<persName><name type="geslachtsnaam">Dirk</name>, VI, Theodericus</persName>'
    )
    self.assertEqual(naam.guess_normal_form(), 'Dirk, VI, Theodericus')

    naam = Name('Lodewijk XVIII')
    self.assertEqual(naam.guess_normal_form2(), 'Lodewijk XVIII')

    s = """<persName> <name type="voornaam">Trijn</name> <name type="intrapositie">van</name> <name type="geslachtsnaam">Leemput</name></persName>"""
    naam = Name().from_string(s)
    self.assertEqual(naam.guess_normal_form(), 'Leemput, Trijn van')
    self.assertEqual(naam.guess_normal_form2(), 'Trijn van Leemput')

    n5 = Name('Piet Gerbrandy', geslachtsnaam='Gerbrandy')
    self.assertEqual(n5.guess_normal_form(), 'Gerbrandy, Piet')
    self.assertEqual(n5.guess_normal_form2(), 'Piet Gerbrandy')

    # Disabled case (family name explicitly set to the first word):
    # n6 = Name('Piet Gerbrandy', geslachtsnaam='Piet')
    # n6._tokenize()
    # self.assertEqual(n6.guess_normal_form(), 'Piet Gerbrandy')
    # self.assertEqual(n6.guess_normal_form2(), 'Gerbrandy Piet')

    n = Name('Hermansz')
    self.assertEqual(n.guess_normal_form(), 'Hermansz')
    self.assertEqual(n.geslachtsnaam(), 'Hermansz')

    # Trailing parenthesised parts are absent from the expected forms.
    n = Name('Ada, van Holland (1)')
    self.assertEqual(n.guess_normal_form(), 'Ada, van Holland')

    n = Name('Hees - B.P. van')
    self.assertEqual(n.guess_normal_form(), 'Hees - B.P. van')

    n = Name('Hees - B.P. van (1234-1235)')
    self.assertEqual(n.guess_normal_form(), 'Hees - B.P. van')

    n = Name('Hoeven, Abraham des Amorie van der (1)')
    self.assertEqual(n.guess_normal_form(), 'Hoeven, Abraham des Amorie van der')
    self.assertEqual(n.guess_normal_form2(), 'Abraham des Amorie van der Hoeven')

    n = Name('Schepper, Gerhard Antoni IJssel de')
    self.assertEqual(n.guess_normal_form(), 'Schepper, Gerhard Antoni IJssel de')
def test_spaces_in_xml(self):
    """Check that a single space separates the tagged parts in the XML."""
    expected_xml = '<persName><name type="voornaam">Jelle</name> <name type="geslachtsnaam">Gerbrandy</name></persName>'
    built = Name(voornaam='Jelle', geslachtsnaam='Gerbrandy')
    self.assertEqual(built.to_string(), expected_xml)
def test_from_xml(self):
    """Check that parsing an XML string keeps the family name and the XML."""
    markup = '<persName>Jelle <name type="geslachtsnaam">Gerbrandy</name></persName>'
    parsed = Name().from_string(markup)
    # Debugging aid, disabled:
    # assert 0, etree.fromstring(markup).xpath('//name[@type="geslachtsnaam"]')
    self.assertEqual(parsed.geslachtsnaam(), 'Gerbrandy')
    # Serializing again must reproduce the input string.
    self.assertEqual(parsed.to_string(), markup)
def test_from_args(self):
    """Check the XML produced when names are built from keyword arguments."""
    jelle_xml = '<persName>Jelle <name type="geslachtsnaam">Gerbrandy</name></persName>'
    # The same construction is checked twice in a row.
    for _ in range(2):
        built = Name().from_args(volledige_naam='Jelle Gerbrandy',
                                 geslachtsnaam='Gerbrandy')
        self.assertEqual(built.to_string(), jelle_xml)

    # Constituents passed straight to the constructor are each tagged.
    built = Name(geslachtsnaam='Gerbrandy', voornaam='Jelle', intrapositie=None)
    tagged_xml = '<persName><name type="voornaam">Jelle</name> <name type="geslachtsnaam">Gerbrandy</name></persName>'
    self.assertEqual(built.to_string(), tagged_xml)

    # A one-letter family name.
    built = Name().from_args(volledige_naam='Arien A', geslachtsnaam='A')
    arien_xml = '<persName>Arien <name type="geslachtsnaam">A</name></persName>'
    self.assertEqual(built.to_string(), arien_xml)
def test_diacritics(self):
    """Check that an accented name is serialized with the accent intact.

    The expected XML spells the accented letter as the escape ``\\xe9``.
    An ``etree.Element`` that was built here but never used has been
    removed.
    """
    n = Name('Wét').store_guessed_geslachtsnaam()
    s = '<persName><name type="geslachtsnaam">W\xe9t</name></persName>'
    self.assertEqual(n.to_string(), s)
def test_to_string(self):
    """Check the XML of a one-word name after storing the guessed family name."""
    stored = Name('abc').store_guessed_geslachtsnaam()
    expected_xml = '<persName><name type="geslachtsnaam">abc</name></persName>'
    self.assertEqual(stored.to_string(), expected_xml)
def test_html_codes(self):
    """Check the full name after calling html2unicode() on a Name."""
    accented = Name('Wél?')
    accented.html2unicode()
    full_name = accented.volledige_naam()
    self.assertEqual(full_name, u'Wél?')
def test_contains_initials(self):
    """Check contains_initials() for names with and without initials."""
    self.assertEqual(Name('J.K. Rowling').guess_geslachtsnaam(), 'Rowling')

    cases = (
        ('J.K. Rowling', True),
        ('Th.D. de Rowling', True),
        ('Rowling, Jan', False),
        ('Rowling, J.', True),
    )
    for raw, expected in cases:
        self.assertEqual(Name(raw).contains_initials(), expected)
def test_geslachtsnaam_guess(self):
    """Check that stripping the tags from the XML gives back the input text."""
    tag_pattern = re.compile('<[^>]+>')
    for namestr in ['abc. DE. F;dk. Genoeg-Van']:
        # Remove every tag from the serialized name; the remaining text
        # must equal the original string.
        text_only = tag_pattern.sub('', Name(namestr).to_string())
        self.assertEqual(namestr, text_only)
def test_sort_key(self):
    """Check the text returned by sort_key() for a range of names.

    The diagnostic sort key is passed to assertTrue as its ``msg``
    argument, so a failing startswith check reports the offending key.
    Before, it was written after the call's closing parenthesis, where it
    only formed a discarded tuple.
    """
    s = '<persName>Jelle <name type="geslachtsnaam">Gerbrandy</name></persName>'
    n = Name().from_string(s)
    self.assertEqual(n.sort_key()[:15], 'gerbrandy jelle')

    s = '<persName>Jelle <name type="geslachtsnaam">Éerbrandy</name></persName>'
    n = Name().from_string(s)
    self.assertEqual(n.sort_key()[:15], 'eerbrandy jelle')

    n = Name(u'São Paolo')
    self.assertEqual(n.geslachtsnaam(), 'Paolo')  # Automatically guessed
    self.assertEqual(n.sort_key().split()[0], 'paolo')

    n = Name('(Hans) Christian')
    self.assertEqual(n.sort_key().split()[0], 'christian')

    n = Name(u'Løwencron')
    self.assertEqual(n.sort_key().split()[0], 'loewencron')

    n = Name(u'?, Pietje')
    self.assertTrue(n.sort_key() > 'a', n.sort_key())

    n = Name("L'Hermite")
    self.assertTrue(n.sort_key().startswith('herm'))

    n = Name("La Hermite")
    self.assertTrue(n.sort_key().startswith('herm'), n.sort_key())

    n = Name(u'Löwel')
    self.assertTrue(n.sort_key().startswith('lo'), n.sort_key())

    n = Name("1'Aubepine, Charles de")  # this name starts with the numeral "1"
    self.assertTrue(n.sort_key().startswith('au'), n.sort_key())

    n = Name(u'Géo d\'Aconit')
    self.assertTrue(n.sort_key().startswith('aco'))

    # Tagged and untagged notations of the same person sort identically.
    s = '<persName>Samuel <name type="geslachtsnaam">Beckett</name></persName>'
    n1 = Name().from_string(s)
    s = '<persName>Beckett, Samuel</persName>'
    n2 = Name().from_string(s)
    self.assertEqual(n1.sort_key(), n2.sort_key())
def test_sort_key(self):
    """Check the text returned by sort_key() for a range of names.

    The diagnostic sort key is passed to assertTrue as its ``msg``
    argument, so a failing startswith check reports the offending key.
    Before, it was written after the call's closing parenthesis, where it
    only formed a discarded tuple.
    """
    s = '<persName>Jelle <name type="geslachtsnaam">Gerbrandy</name></persName>'
    n = Name().from_string(s)
    self.assertEqual(n.sort_key()[:15], 'gerbrandy jelle')

    s = '<persName>Jelle <name type="geslachtsnaam">Éerbrandy</name></persName>'
    n = Name().from_string(s)
    self.assertEqual(n.sort_key()[:15], 'eerbrandy jelle')

    n = Name('São Paolo')
    self.assertEqual(n.geslachtsnaam(), 'Paolo')  # Automatically guessed
    self.assertEqual(n.sort_key().split()[0], 'paolo')

    n = Name('(Hans) Christian')
    self.assertEqual(n.sort_key().split()[0], 'christian')

    n = Name('Løwencron')
    self.assertEqual(n.sort_key().split()[0], 'loewencron')

    n = Name('?, Pietje')
    self.assertTrue(n.sort_key() > 'a', n.sort_key())

    n = Name("L'Hermite")
    self.assertTrue(n.sort_key().startswith('herm'))

    n = Name("La Hermite")
    self.assertTrue(n.sort_key().startswith('herm'), n.sort_key())

    n = Name('Löwel')
    self.assertTrue(n.sort_key().startswith('lo'), n.sort_key())

    n = Name("1'Aubepine, Charles de")  # this name starts with the numeral "1"
    self.assertTrue(n.sort_key().startswith('au'), n.sort_key())

    n = Name('Géo d\'Aconit')
    self.assertTrue(n.sort_key().startswith('aco'))

    # Tagged and untagged notations of the same person sort identically.
    s = '<persName>Samuel <name type="geslachtsnaam">Beckett</name></persName>'
    n1 = Name().from_string(s)
    s = '<persName>Beckett, Samuel</persName>'
    n2 = Name().from_string(s)
    self.assertEqual(n1.sort_key(), n2.sort_key())
def test_from_xml(self):
    """Check that parsing an XML string keeps the family name and the XML."""
    markup = '<persName>Jelle <name type="geslachtsnaam">Gerbrandy</name></persName>'
    parsed = Name().from_string(markup)
    # Debugging aid, disabled:
    # assert 0, etree.fromstring(markup).xpath('//name[@type="geslachtsnaam"]')
    self.assertEqual(parsed.geslachtsnaam(), 'Gerbrandy')
    # Serializing again must reproduce the input string.
    self.assertEqual(parsed.to_string(), markup)
def test_volledige_naam(self):
    """Check get_volledige_naam() for names that only hold a given name."""
    jelle = Name(voornaam='Jelle')
    self.assertEqual(jelle.get_volledige_naam(), 'Jelle')
    # The full name must be the same after guessing the family name.
    jelle.guess_geslachtsnaam()
    self.assertEqual(jelle.get_volledige_naam(), 'Jelle')

    markup = """<persName> <name type="voornaam">Hendrik IV</name> </persName>"""
    from_xml = Name().from_string(markup)
    self.assertEqual(from_xml.get_volledige_naam(), 'Hendrik IV')

    from_kwargs = Name(voornaam='Hendrik IV')
    self.assertEqual(from_kwargs.get_volledige_naam(), 'Hendrik IV')
def test_diacritics(self):
    """Check that an accented name is serialized with the accent intact.

    The expected XML spells the accented letter as the escape ``\\xe9``.
    An ``etree.Element`` that was built here but never used has been
    removed.
    """
    n = Name(u'Wét').store_guessed_geslachtsnaam()
    s = u'<persName><name type="geslachtsnaam">W\xe9t</name></persName>'
    self.assertEqual(n.to_string(), s)
def test_insert_consituent(self):
    """Check that _insert_constituent tags the matched text with a type."""
    partly_tagged = '<persName>Hugo <name type="intrapositie">de</name> <name type="geslachtsnaam">Groot</name></persName>'
    fully_tagged = '<persName><name type="voornaam">Hugo</name> <name type="intrapositie">de</name> <name type="geslachtsnaam">Groot</name></persName>'
    untagged = '<persName>Hugo de Groot</persName>'
    intraposition_tagged = '<persName>Hugo <name type="intrapositie">de</name> Groot</persName>'

    name = Name().from_string(partly_tagged)
    leading_text = name.to_xml().text
    # Sanity check on the text the match below is taken from.
    self.assertEqual(leading_text, 'Hugo ')
    name._insert_constituent('voornaam', re.match('Hugo', leading_text))
    self.assertEqual(name.to_string(), fully_tagged)

    name = Name().from_string(untagged)
    whole_text = name.to_xml().text
    name._insert_constituent('intrapositie', re.search('de', whole_text))
    self.assertEqual(name.to_string(), intraposition_tagged)
def test_guess_normal_form(self):
    """Check guess_normal_form() against a table of names and expected forms."""
    self.assertEqual(Name('Arien A').guess_normal_form(), 'A, Arien')

    # (name, expected normal form); commented entries are disabled cases.
    cases = [
        (Name().from_args(geslachtsnaam='A', volledige_naam='Arien A'), 'A, Arien'),
        (Name('Brugse Meester van 1493'), 'Brugse Meester van 1493'),
        (Name('Th.W. Engelmann'), 'Engelmann, Th.W.'),
        (Name('A. Algra'), 'Algra, A.'),
        # (Name().from_string('<persName>A. Algra</persName>'), 'Algra A.')
        (Name('(G. Morton)'), 'Morton, G.'),
        (Name('Di(e)ck, Jan Gerard'), 'Dick, Jan Gerard'),
        (Name('Arien A'), 'A, Arien'),
        (Name('David Heilbron Cz.'), 'Heilbron Cz., David'),
        (Name('Johann (Johan) VII'), 'Johann VII'),
        (Name('Johann VII'), 'Johann VII'),
        # (Name('koning Willem III'), 'Willem III'),
        (Name('Crato, graaf van Nassau-Saarbrück'),
         'Crato, graaf van Nassau-Saarbrück'),
        (Name('Wilhelmina van Pruisen - prinses van Oranje-Nassau'),
         'Wilhelmina van Pruisen - prinses van Oranje-Nassau'),
        (Name('Henriette Adriana Louise Flora d\'Oultremont de Wégimont'),
         "d'Oultremont de Wégimont, Henriette Adriana Louise Flora"),
        (Name('Wolrat, vorst van Nassau-Usingen dikwijls genoemd Nassau-Saarbrück'),
         'Wolrat, vorst van Nassau-Usingen dikwijls genoemd Nassau-Saarbrück'),
        (Name('van \'s-Gravezande, Arnoldus Corneluszn. Storm'),
         's-Gravezande, Arnoldus Corneluszn. Storm, van'),
        (Name('L.T. graaf van Nassau La Lecq'), 'L.T. graaf van Nassau La Lecq'),
        (Name('Géo d\'Aconit'), 'd\'Aconit, Géo'),
        (Name('J. Heemskerk Azn.'), 'Heemskerk Azn., J.'),
    ]
    for candidate, wanted in cases:
        self.assertEqual(candidate.guess_normal_form(), wanted)

    self.assertEqual(Name('A').guess_normal_form(), 'A')
    self.assertEqual(Name('Hendrik ten Brink Hz.').guess_normal_form(),
                     'Brink Hz., Hendrik ten')

    # Names built from parsed XML elements, in both word orders.
    element = etree.fromstring('<persName>Kees van Dongen</persName>')  #@UndefinedVariable
    kees = Name().from_xml(element)
    self.assertEqual(kees.guess_geslachtsnaam(), 'Dongen')
    self.assertEqual(kees.guess_normal_form(), 'Dongen, Kees van')

    element = etree.fromstring('<persName>Dongen, Kees van</persName>')  #@UndefinedVariable
    kees = Name().from_xml(element)
    self.assertEqual(kees.guess_normal_form(), 'Dongen, Kees van')
def test_extremes(self):
    """Check that two identical names get a similarity ratio of 1.0."""
    score = Similarity.ratio(Name('XXX'), Name('XXX'))
    # The explain=1 result is used as the failure message.
    explanation = Similarity.ratio(Name('XXX'), Name('XXX'), explain=1)
    self.assertEqual(score, 1.0, explanation)
def test_constituent_tokens(self):
    """Check the (token, type) pairs from _guess_constituent_tokens()."""
    # The first two cases compare the string form of both sides.
    stringified_cases = [
        ('koning Karel VI',
         [('koning', TYPE_TERRITORIAL), ('Karel', TYPE_GIVENNAME),
          ('VI', TYPE_GIVENNAME)]),
        ('Karel VI',
         [('Karel', TYPE_GIVENNAME), ('VI', TYPE_GIVENNAME)]),
    ]
    for raw, expected in stringified_cases:
        self.assertEqual(str(Name(raw)._guess_constituent_tokens()), str(expected))

    # The remaining cases compare the token lists directly.
    direct_cases = [
        ('A.R. Bastiaensen CM',
         [('A.', TYPE_GIVENNAME), ('R.', TYPE_GIVENNAME),
          ('Bastiaensen', TYPE_FAMILYNAME), ('CM', TYPE_FAMILYNAME)]),
        ('Willem III, graaf van Nassau',
         [('Willem', TYPE_GIVENNAME), ('III', TYPE_GIVENNAME), (',', ','),
          ('graaf', TYPE_TERRITORIAL), ('van', TYPE_TERRITORIAL),
          ('Nassau', TYPE_TERRITORIAL)]),
        ('Amelia van Nassau-Dietz',
         [('Amelia', 'voornaam'), ('van', 'intrapositie'),
          ('Nassau', 'geslachtsnaam'), ('-', 'geslachtsnaam'),
          ('Dietz', 'geslachtsnaam')]),
        ('Johan IV van Nassau-Dillenburg',
         [('Johan', 'voornaam'), ('IV', 'voornaam'), ('van', 'intrapositie'),
          ('Nassau', 'geslachtsnaam'), ('-', 'geslachtsnaam'),
          ('Dillenburg', 'geslachtsnaam')]),
        ('Maurits Lodewijk van Nassau La Lecq',
         [('Maurits', 'voornaam'), ('Lodewijk', 'voornaam'),
          ('van', 'intrapositie'), ('Nassau', 'geslachtsnaam'),
          ('La', 'geslachtsnaam'), ('Lecq', 'geslachtsnaam')]),
        ('Mencía de Mendoza y Fonseca',
         [('Menc\xeda', 'voornaam'), ('de', 'intrapositie'),
          ('Mendoza', 'geslachtsnaam'), ('y', 'geslachtsnaam'),
          ('Fonseca', 'geslachtsnaam')]),
        ('Wilhelmina van Pruisen - prinses van Oranje-Nassau',
         [('Wilhelmina', 'geslachtsnaam'), ('van', 'intrapositie'),
          ('Pruisen', 'geslachtsnaam'), ('-', '-'),
          ('prinses', 'territoriale_titel'), ('van', 'territoriale_titel'),
          ('Oranje', 'territoriale_titel'), ('-', '-'),
          ('Nassau', 'territoriale_titel')]),
        ('Henriette Adriana Louise Flora d\'Oultremont de Wégimont',
         [('Henriette', 'voornaam'), ('Adriana', 'voornaam'),
          ('Louise', 'voornaam'), ('Flora', 'voornaam'),
          ("d'", 'geslachtsnaam'), ('Oultremont', 'geslachtsnaam'),
          ('de', 'intrapositie'), ('W\xe9gimont', 'geslachtsnaam')]),
        ('Hendrik de Graaf',
         [('Hendrik', 'voornaam'), ('de', 'intrapositie'),
          ('Graaf', 'geslachtsnaam')]),
        ('Hendrick graaf van Cuyck',
         [('Hendrick', 'geslachtsnaam'), ('graaf', 'territoriale_titel'),
          ('van', 'territoriale_titel'), ('Cuyck', 'territoriale_titel')]),
        ('Hoeven, Abraham des Amorie van der',
         [('Hoeven', 'geslachtsnaam'), (',', ','), ('Abraham', 'voornaam'),
          ('des', 'intrapositie'), ('Amorie', 'geslachtsnaam'),
          ('van', 'intrapositie'), ('der', 'intrapositie')]),
        ('Schwartzenberg, Johan Onuphrius thoe',
         [('Schwartzenberg', 'geslachtsnaam'), (',', ','),
          ('Johan', 'voornaam'), ('Onuphrius', 'voornaam'),
          ('thoe', 'intrapositie')]),
    ]
    for raw, expected in direct_cases:
        self.assertEqual(Name(raw)._guess_constituent_tokens(), expected)
def test_similarity(self):
    # Exercises the similarity helpers of this test case on many name
    # combinations: first orderings via assert_similarity_order, then pairs
    # measured against a benchmark pair via assert_more_similar.
    # NOTE(review): the exact contract of both helpers is defined outside
    # this method -- confirm there what "order" and "more similar" mean.

    def check_order(*raw_names):
        # Build the names left to right and hand them over as one list.
        self.assert_similarity_order([Name(raw) for raw in raw_names])

    jelle = Name('Jelle Gerbrandy')
    jelle_typo = Name('Jelle Gerbkandy')
    pietje_puk = Name('Pietje Puk')
    jelle_tagged = Name('Jelle Gerbrandy', geslachtsnaam='Gerbrandy')
    piet_tagged = Name('Piet Gerbrandy', geslachtsnaam='Gerbrandy')
    piet_mistagged = Name('Piet Gerbrandy', geslachtsnaam='Piet')

    check_order('Jelle Gerbrandy', 'J. Gerbrandy', 'P. Gerbrandy')
    self.assert_similarity_order([jelle, jelle_typo, pietje_puk])
    self.assert_similarity_order([piet_tagged, jelle_tagged, piet_mistagged])

    # I would like Jelle Gerbrandy to resemble J. Gerbrandy more than it
    # resembles Pelle Gerbrandy (same ordering as the first check).
    check_order('Jelle Gerbrandy', 'J. Gerbrandy', 'P. Gerbrandy')

    check_order('Jansz., Willem', 'Jansz., Wouter', 'Jonge, Willem de')
    check_order('Hermans, A.', 'Hermans', 'Hermansz', 'Hermans, P.')
    check_order(
        'Campen, Abraham Willem van',
        'Kampen, Pieter Nicolaas van',
        'Campensnieuwland, De Jonge van',
    )
    check_order(
        'Kluit, Jan van',
        'Kluyt, Jan',
        'Kluyt, J.',
        'Kluyt, Petrus',
        # disabled: 'Cluyt, Pieter',
        'Cluts, Daniël',
    )

    # One entry is parsed from persName markup, so this list is spelled out.
    self.assert_similarity_order([
        Name('vaal'),
        Name('Vaal, Jacob'),
        Name().from_string('<persName><name type="geslachtsnaam">Waal</name>, Henri van de</persName>'),
        Name('Waal, Henri van de'),
    ])

    check_order('gerbrandy', 'Gerbrandij, Pieter', 'Gerbrandus')
    check_order('Haack, Simon', 'Haak, Simon', 'Haack, Petrus')

    # oldenbarnevelt resembles the one (roughly) as much as the other.
    check_order(
        'oldenbarnevelt',
        'Oldenbarnevelt, Willem van ',
        'Oldenbarnevelt, dr. Johan van',
    )
    check_order(
        'Hendrik IV',
        'Hendrick IV',
        'Hendrik V',
        'Filips IV',
        'Hendrik',
    )

    # Two entries are built from the voornaam keyword only.
    self.assert_similarity_order([
        Name(voornaam='(Hans) Christian'),
        Name(voornaam='Christian'),
        Name('Johan Christiaan'),
    ])

    check_order(
        'Aerssen-Walta, Lucia van',
        'Walta, Lucia van',
        'Aerssens, Lucia van',
        'Harselaar, Willem van',
        'St. Luc, Jacques de',
    )
    check_order(
        'Constant Rebecque De Villars, Jules Thierry Nicolas baron de',
        'Constant Rebecque, J.V. baron de',
        'Constant Rebecque, Mr. Charles Theodore Jean baron de',
        # disabled: 'Constantijn',
        # disabled: 'Rebecque, J.F. de Constant',
    )
    check_order('Willem III', 'koning Willem III', 'Willem')
    check_order(
        "Pierre de l'Oyseleur dit de Villiers, (hof)predikant",
        "L'Oyseleur Dit de Villiers, Pierre ",
        'Villiers, Anne',
        'Philips',
        # disabled: 'Willem III',
    )

    # This pair has a larger score than a great many others (the original
    # Dutch remark is cut off after "and").
    benchmark = (Name('Craen, Anna'), Name('Craen, Andrea'))
    benchmark_top = (Name('Jacob Dirks'), Name('Dirks, Mr. Jacob'))

    def check_beats_benchmark(first, second):
        # The pair under test goes first, the benchmark pair second.
        self.assert_more_similar([(first, second), benchmark])

    check_beats_benchmark(
        Name('Engelmann, Theodoor Wilhelm'),
        Name('Th.W. Engelmann'),
    )
    check_beats_benchmark(
        Name('Borret, Theodorus Josephus Hubertus'),
        Name('Borret, Prof. Dr. Theodoor Joseph Hubert'),
    )
    check_beats_benchmark(
        Name('Buyts, Helena'),
        Name('Buydts, Helena '),
    )
    check_beats_benchmark(
        Name('Craen, Jelle Douwe'),
        Name('Craen, Jelle'),
    )
    check_beats_benchmark(
        Name('Herman Johan Royaards'),
        Name('Royaards, Hermannus'),
    )
    check_beats_benchmark(
        Name('Carl Peter Thunberg'),
        Name('Thunberg, Dr. Karl Peter'),
    )
    # disabled pair: ('Johanness Henricus Scholten', 'Scholten, J.')
    check_beats_benchmark(
        Name('Johanness Henricus Scholten'),
        Name('Scholten, J.H.'),
    )
    check_beats_benchmark(
        Name('Johannes Steenmeijer'),
        Name('Steenmeyer, Johannes'),
    )
    check_beats_benchmark(
        Name('Johannes Stéénmeijer'),
        Name('Steenmeyer, Johannes'),
    )
    check_beats_benchmark(
        Name('Maria, gravin van Nassau (1)'),
        Name('Maria, gravin van Nassau (2)'),
    )
    check_beats_benchmark(
        Name(geslachtsnaam='des Amorie van der Hoeven', voornaam='Abraham'),
        Name('Hoeven, Abraham des Amorie van der (1)'),
    )

    # The benchmark is disabled for this pair, so the list holds one pair.
    self.assert_more_similar([
        (Name('oldenbarnevelt'), Name('Oldenbarnevelt, dr. Johan van')),
    ])

    check_beats_benchmark(
        Name('oldenbarnevelt'),
        Name('Oldenbarnevelt, Willem van '),
    )
    check_beats_benchmark(
        Name('Prof. Dr. Ing. Jhr. Johan Brootjens'),
        Name('Johan Brootjens'),
    )
    check_beats_benchmark(
        Name('Apostool, C.'),
        Name('Cornelis Apostool'),
    )

    # Here the top benchmark comes first and the pair under test second.
    self.assert_more_similar([
        benchmark_top,
        (Name('Feith, Rhijnvis'), Name('Feith, Johan Adriaan')),
    ])
def test_tokenize(self):
    # Round-trips persName markup through Name()._tokenize() and
    # Name()._detokenize(), then checks a few extra tokenization cases.

    # Each entry pairs persName markup with the tokens expected from it.
    round_trips = [
        (
            '<persName>Hugo <name type="intrapositie">de</name> <name type="geslachtsnaam">Groot</name></persName>',
            [
                Token('Hugo', None, tail=' '),
                Token('de', 'intrapositie', tail=' '),
                Token('Groot', 'geslachtsnaam'),
            ],
        ),
        (
            '<persName><name type="voornaam">Hugo</name> <name type="intrapositie">de</name> <name type="geslachtsnaam">Groot</name></persName>',
            [
                Token('Hugo', 'voornaam', tail=' '),
                Token('de', 'intrapositie', tail=' '),
                Token('Groot', 'geslachtsnaam'),
            ],
        ),
        (
            '<persName>Hugo de Groot</persName>',
            [
                Token('Hugo', None, tail=' '),
                Token('de', None, tail=' '),
                Token('Groot', None),
            ],
        ),
        (
            '<persName>Groot, Hugo</persName>',
            [
                Token('Groot', None),
                Token(',', None, tail=' '),
                Token('Hugo', None),
            ],
        ),
        (
            '<persName>H.P. de Groot</persName>',
            [
                Token('H.', None),
                Token('P.', None, tail=' '),
                Token('de', None, tail=' '),
                Token('Groot', None),
            ],
        ),
    ]

    # The first sample is checked as a whole, then tail by tail; every tail
    # check tokenizes the markup afresh.
    first_markup, first_tokens = round_trips[0]
    self.assertEqual(Name().from_string(first_markup)._tokenize(), first_tokens)
    for position, expected_tail in enumerate([' ', ' ', '']):
        self.assertEqual(
            Name().from_string(first_markup)._tokenize()[position].tail(),
            expected_tail,
        )

    for markup, tokens in round_trips[1:]:
        self.assertEqual(Name().from_string(markup)._tokenize(), tokens)

    # Detokenizing the expected tokens must give back the markup.
    for markup, tokens in round_trips:
        self.assertEqual(etree.tostring(Name()._detokenize(tokens)), markup)  # @UndefinedVariable

    # Parentheses become tokens of their own.
    bracketed_markup = '<persName>Beter (met haakjes)</persName>'
    bracketed_tokens = [
        Token('Beter', None, tail=' '),
        Token('(', tail=''),
        Token('met', tail=' '),
        Token('haakjes'),
        Token(')', None),
    ]
    self.assertEqual(
        Name().from_string(bracketed_markup)._tokenize(), bracketed_tokens)
    self.assertEqual(
        etree.tostring(Name()._detokenize(bracketed_tokens)),  # @UndefinedVariable
        bracketed_markup,
    )

    # Initials written without spaces are still split after each period.
    compact_markup = '<persName>C.H.Veenstra</persName>'
    compact_tokens = [
        Token('C.', None),
        Token('H.', None),
        Token('Veenstra', None),
    ]
    self.assertEqual(
        Name().from_string(compact_markup)._tokenize(), compact_tokens)

    # The expected value here is a list of plain tuples, not Token objects.
    self.assertEqual(
        Name("l'Abc")._tokenize(),
        [("l'", None), ('Abc', None)],
    )