Example #1
def test_lemmatize_unicode():
    _check_corenlp()
    lines = parse(u"H\xf6v\xedk hit C\xe9sar",
                  annotators=['tokenize', 'ssplit', 'pos', 'lemma'])
    saf = stanford_to_saf(lines)
    assert_equal({t['lemma'] for t in saf['tokens']},
                 {'Cesar', 'hit', 'Hovik'})
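
Each of these tests opens with _check_corenlp(), which is expected to skip (rather than fail) when no CoreNLP installation is available. A minimal sketch of such a guard, assuming a hypothetical CORENLP_HOME environment variable advertises the installation (the real helper may probe differently):

import os
from unittest import SkipTest


def _check_corenlp():
    # Skip, rather than fail, when CoreNLP is not installed.
    # CORENLP_HOME is an assumption for this sketch only.
    if not os.environ.get('CORENLP_HOME'):
        raise SkipTest("CORENLP_HOME not set; skipping CoreNLP tests")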
Example #2
def test_ner():
    _check_corenlp()
    annotators = ['tokenize', 'ssplit', 'pos', 'lemma', 'ner']
    saf = stanford_to_saf(parse("John lives in Amsterdam",
                                annotators=annotators))
    lemmata = {t['id']: t['lemma'] for t in saf['tokens']}
    entities = {lemmata[e['tokens'][0]]: e['type'] for e in saf['entities']}
    assert_equal(entities, {'John': 'PERSON', 'Amsterdam': 'LOCATION'})
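
The entity assertion works because SAF entities point back into the token list by id. A self-contained illustration of that join, using a hand-written SAF fragment rather than real parser output:

# Hand-written SAF fragment; real stanford_to_saf output carries more
# fields per token (pos, offsets, sentence, ...).
saf = {
    'tokens': [{'id': 1, 'lemma': 'John'},
               {'id': 2, 'lemma': 'live'},
               {'id': 3, 'lemma': 'in'},
               {'id': 4, 'lemma': 'Amsterdam'}],
    'entities': [{'tokens': [1], 'type': 'PERSON'},
                 {'tokens': [4], 'type': 'LOCATION'}],
}
lemmata = {t['id']: t['lemma'] for t in saf['tokens']}
entities = {lemmata[e['tokens'][0]]: e['type'] for e in saf['entities']}
assert entities == {'John': 'PERSON', 'Amsterdam': 'LOCATION'}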
Example #3
def test_raw():
    _check_corenlp()
    lines = parse("It. Works\n", annotators=['tokenize', 'ssplit'])
    expected = ['Sentence #1 (2 tokens):', 'It.',
                '[Text=It CharacterOffsetBegin=0 CharacterOffsetEnd=2]'
                ' [Text=. CharacterOffsetBegin=2 CharacterOffsetEnd=3]',
                'Sentence #2 (1 tokens):', 'Works',
                '[Text=Works CharacterOffsetBegin=4 CharacterOffsetEnd=9]']
    assert_equal(lines, expected)
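
With only tokenize and ssplit requested, parse returns CoreNLP's raw plain-text output, in which each token is a bracketed run of key=value attributes. A small illustrative parser for such lines (a sketch, not part of the library; it would need more care for values containing spaces or brackets):

import re


def parse_token_line(line):
    # Split '[Text=It CharacterOffsetBegin=0 ...]'-style records into
    # dicts of their key=value attributes.
    return [dict(pair.split('=', 1) for pair in body.split())
            for body in re.findall(r'\[([^\]]*)\]', line)]


line = ('[Text=It CharacterOffsetBegin=0 CharacterOffsetEnd=2]'
        ' [Text=. CharacterOffsetBegin=2 CharacterOffsetEnd=3]')
assert parse_token_line(line)[0]['Text'] == 'It'
assert parse_token_line(line)[1]['CharacterOffsetEnd'] == '3'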
Example #4
def test_lemmatize():
    _check_corenlp()
    lines = parse("He jumped. \n\n Cool!",
                  annotators=['tokenize', 'ssplit', 'pos', 'lemma'])
    saf = stanford_to_saf(lines)
    assert_equal(set(saf.keys()), {'tokens', 'header'})

    assert_equal({t['lemma'] for t in saf['tokens']},
                 {'he', 'jump', 'cool', '!', '.'})
    assert_equal({t['sentence'] for t in saf['tokens']},
                 {1, 2})
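
Because every token records its sentence number, the flat token list can be regrouped into sentences with itertools.groupby, assuming tokens are ordered by sentence as the assertions above imply:

from itertools import groupby


def sentences(saf):
    # Regroup the flat SAF token list by sentence number; assumes the
    # tokens are already ordered by sentence.
    return {nr: [t['lemma'] for t in toks]
            for nr, toks in groupby(saf['tokens'],
                                    key=lambda t: t['sentence'])}


toks = [{'sentence': 1, 'lemma': 'he'}, {'sentence': 1, 'lemma': 'jump'},
        {'sentence': 2, 'lemma': 'cool'}]
assert sentences({'tokens': toks}) == {1: ['he', 'jump'], 2: ['cool']}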
Example #5
def test_parse():
    _check_corenlp()
    saf = stanford_to_saf(parse("John loves himself"))
    lemmata = {t['id']: t['lemma'] for t in saf['tokens']}
    assert_equal(saf['trees'], [{
        "tree": "(ROOT (S (NP (NNP John)) (VP (VBZ loves) "
                "(NP (PRP himself)))))",
        "sentence": 1
        }])
    deps = {(lemmata[d['child']], lemmata[d['parent']], d['relation'])
            for d in saf['dependencies']}
    assert_equal(deps, {('John', 'love', 'nsubj'),
                        ('himself', 'love', 'dobj')})
    corefs = {tuple(sorted([lemmata[c[0][0]], lemmata[c[1][0]]]))
              for c in saf['coreferences']}
    assert_equal(corefs, {tuple(sorted(['John', 'himself']))})
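
The coreference assertion implies the shape of saf['coreferences']: each entry pairs two mentions, each mention being a list of token ids, so c[0][0] is the first token of the first mention. A hand-written fragment in that shape (real output may use lists rather than tuples for the pairs):

saf = {
    'tokens': [{'id': 1, 'lemma': 'John'},
               {'id': 2, 'lemma': 'love'},
               {'id': 3, 'lemma': 'himself'}],
    'coreferences': [([1], [3])],
}
lemmata = {t['id']: t['lemma'] for t in saf['tokens']}
corefs = {tuple(sorted([lemmata[c[0][0]], lemmata[c[1][0]]]))
          for c in saf['coreferences']}
assert corefs == {('John', 'himself')}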
Example #6
def test_multiple_sentences():
    _check_corenlp()
    p = parse("John lives in Amsterdam. He works in London")
    saf = stanford_to_saf(p)
    tokens = {t['id']: t for t in saf['tokens']}
    # are token ids unique?
    assert_equal(len(tokens), len(saf['tokens']))
    # is location in second sentence correct?
    entities = {tokens[e['tokens'][0]]['lemma']: e['type']
                for e in saf['entities']}
    assert_in(('London', 'LOCATION'), entities.items())
    # is dependency in second sentence correct?
    rels = [(tokens[rel['child']]['lemma'], rel['relation'],
             tokens[rel['parent']]['lemma'])
            for rel in saf['dependencies']]
    assert_in(("he", "nsubj", "work"), rels)
    assert_in(("John", "nsubj", "live"), rels)
    # is coref parsed correctly?
    coref = {(tokens[x[0][0]]['lemma'], tokens[x[1][0]]['lemma'])
             for x in saf['coreferences']}
    assert_equal(coref, {("John", "he")})
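
The uniqueness check at the top of this test matters because token ids are the keys through which every other SAF layer (entities, dependencies, coreferences) refers to tokens across sentence boundaries. One scheme that satisfies it is a single running counter; a hypothetical sketch (the real converter may number tokens differently):

def assign_ids(sentences):
    # Keep token ids unique across sentences with one running counter.
    tokens, next_id = [], 1
    for sent_nr, words in enumerate(sentences, start=1):
        for word in words:
            tokens.append({'id': next_id, 'sentence': sent_nr,
                           'word': word})
            next_id += 1
    return tokens


toks = assign_ids([['John', 'lives', 'in', 'Amsterdam', '.'],
                   ['He', 'works', 'in', 'London']])
assert len({t['id'] for t in toks}) == len(toks)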