def test_lemmatize_unicode(): _check_corenlp() lines = parse(u"H\xf6v\xedk hit C\xe9sar", annotators=['tokenize', 'ssplit', 'pos', 'lemma']) saf = stanford_to_saf(lines) assert_equal({t['lemma'] for t in saf['tokens']}, {'Cesar', 'hit', 'Hovik'})
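# _check_corenlp is referenced by the tests but not defined in this
# excerpt. Below is a minimal sketch, assuming CoreNLP's location is
# given by a CORENLP_HOME environment variable (the variable name and
# the skip behaviour are assumptions, not the canonical helper).
from nose import SkipTest


def _check_corenlp():
    # Skip, rather than fail, when no CoreNLP installation is configured.
    if not os.environ.get('CORENLP_HOME'):
        raise SkipTest("CoreNLP not found; set CORENLP_HOME to its "
                       "installation directory")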
def test_ner():
    _check_corenlp()
    annotators = ['tokenize', 'ssplit', 'pos', 'lemma', 'ner']
    saf = stanford_to_saf(parse("John lives in Amsterdam",
                                annotators=annotators))
    lemmata = {t['id']: t['lemma'] for t in saf['tokens']}
    entities = {lemmata[e['tokens'][0]]: e['type'] for e in saf['entities']}
    assert_equal(entities, {'John': 'PERSON', 'Amsterdam': 'LOCATION'})
def test_parse_xml():
    with open(os.path.join(os.path.dirname(__file__),
                           "test_corenlp.xml")) as f:
        xml = f.read()
    saf = stanford_to_saf(xml)
    assert_equal({t['lemma'] for t in saf['tokens']},
                 {"John", "attack", "I", "in", "London", "hit",
                  "he", "back", "."})
    london = [t for t in saf['tokens'] if t['lemma'] == 'London'][0]
    assert_equal(london['pos'], 'NNP')
    assert_in({"type": "LOCATION", "tokens": [london['id']]},
              saf['entities'])
def test_lemmatize():
    _check_corenlp()
    lines = parse("He jumped. \n\n Cool!",
                  annotators=['tokenize', 'ssplit', 'pos', 'lemma'])
    saf = stanford_to_saf(lines)
    assert_equal(set(saf.keys()), {'tokens', 'header'})
    assert_equal({t['lemma'] for t in saf['tokens']},
                 {'he', 'jump', 'cool', '!', '.'})
    assert_equal({t['sentence'] for t in saf['tokens']}, {1, 2})
def test_parse():
    _check_corenlp()
    saf = stanford_to_saf(parse("John loves himself"))
    lemmata = {t['id']: t['lemma'] for t in saf['tokens']}
    assert_equal(saf['trees'],
                 [{"tree": "(ROOT (S (NP (NNP John)) (VP (VBZ loves) "
                           "(NP (PRP himself)))))",
                   "sentence": 1}])
    deps = {(lemmata[d['child']], lemmata[d['parent']], d['relation'])
            for d in saf['dependencies']}
    assert_equal(deps, {('John', 'love', 'nsubj'),
                        ('himself', 'love', 'dobj')})
    corefs = {tuple(sorted([lemmata[c[0][0]], lemmata[c[1][0]]]))
              for c in saf['coreferences']}
    assert_equal(corefs, {tuple(sorted(['John', 'himself']))})
def test_multiple_sentences():
    _check_corenlp()
    p = parse("John lives in Amsterdam. He works in London")
    saf = stanford_to_saf(p)
    tokens = {t['id']: t for t in saf['tokens']}
    # Are token ids unique across sentences?
    assert_equal(len(tokens), len(saf['tokens']))
    # Is the location in the second sentence correct?
    entities = {tokens[e['tokens'][0]]['lemma']: e['type']
                for e in saf['entities']}
    assert_in(('London', 'LOCATION'), entities.items())
    # Are the dependencies in both sentences correct?
    rels = [(tokens[rel['child']]['lemma'], rel['relation'],
             tokens[rel['parent']]['lemma'])
            for rel in saf['dependencies']]
    assert_in(("he", "nsubj", "work"), rels)
    assert_in(("John", "nsubj", "live"), rels)
    # Is coreference across sentences parsed correctly?
    coref = {(tokens[x[0][0]]['lemma'], tokens[x[1][0]]['lemma'])
             for x in saf['coreferences']}
    assert_equal(coref, {("John", "he")})