def test_simple_tokenize_handles_unicode():
    """Tokenizing a name with non-ASCII letters yields lowercased Unicode tokens."""
    result = simple_tokenize(u'Dœ, Jöhn Π.')
    assert result == {
        'lastnames': [NameToken(u'dœ')],
        'nonlastnames': [NameToken(u'jöhn'), NameInitial(u'π')],
    }
def author_tokenize(name):
    """This is how the name should be tokenized for the matcher."""
    res = {'lastnames': [], 'nonlastnames': []}
    for key, tokens in scan_author_string_for_phrases(name).items():
        # Ignore any phrase categories the matcher does not use.
        if key not in res:
            continue
        bucket = res[key]
        for token in tokens:
            # Single characters are treated as initials, everything else as full tokens.
            wrapper = NameInitial if len(token) == 1 else NameToken
            bucket.append(wrapper(token))
    return res