Example #1
def test_simple_tokenize_handles_unicode():
    name = u'Dœ, Jöhn Π.'

    result = simple_tokenize(name)
    expected = {
        'lastnames': [NameToken(u'dœ')],
        'nonlastnames': [NameToken(u'jöhn'), NameInitial(u'π')],
    }

    assert result == expected
Example #2
def author_tokenize(name):
    """This is how the name should be tokenized for the matcher."""
    phrases = scan_author_string_for_phrases(name)
    res = {'lastnames': [], 'nonlastnames': []}
    for key, tokens in phrases.items():
        lst = res.get(key)
        if lst is None:
            # Skip phrase groups other than 'lastnames' and 'nonlastnames'.
            continue
        for token in tokens:
            if len(token) == 1:
                # Single-character tokens become initials.
                lst.append(NameInitial(token))
            else:
                lst.append(NameToken(token))
    return res
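
For context, here is a minimal sketch of how author_tokenize might be exercised end to end. NameToken, NameInitial and scan_author_string_for_phrases are defined below only as simplified stand-ins for this illustration; in the real codebase they come from the project's name-handling utilities, and the actual phrase scanner treats titles, suffixes and punctuation far more carefully than the toy version here.

# Hypothetical stand-ins for illustration only; the real classes and scanner
# are provided by the project's name-handling utilities.
class NameToken(object):
    def __init__(self, token):
        self.token = token

    def __repr__(self):
        return '%s(%r)' % (type(self).__name__, self.token)

    def __eq__(self, other):
        # Value equality, so tokens can be compared as in Example #1.
        return type(self) is type(other) and self.token == other.token


class NameInitial(NameToken):
    pass


def scan_author_string_for_phrases(name):
    # Simplified, hypothetical scanner: the part before the first comma is the
    # last-name phrase, the rest are non-last-name tokens; trailing periods are
    # stripped and everything is lowercased.
    last, _, rest = name.partition(',')
    return {
        'lastnames': [t.strip('.').lower() for t in last.split()],
        'nonlastnames': [t.strip('.').lower() for t in rest.split()],
    }


print(author_tokenize(u'Doe, John R.'))
# {'lastnames': [NameToken('doe')],
#  'nonlastnames': [NameToken('john'), NameInitial('r')]}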