Example #1
import os
from io import BytesIO

from KafNafParserPy import KafNafParser
from nose.tools import assert_equal, assert_in  # assert helpers assumed from nose

import corenlp  # project-local module under test (import path assumed)


def test_corenlp2naf():
    xml = open(os.path.join(os.path.dirname(__file__), "test_corenlp.xml")).read()
    naf_bytes = corenlp.corenlp2naf(xml, corenlp.PARSER)
    naf = KafNafParser(BytesIO(naf_bytes))
    terms = {t.get_id(): t.get_lemma() for t in naf.get_terms()}
    assert_equal(set(terms.values()), {"John", "attack", "I", "in", "London", "hit", "he", "back", "."})
    london = [t for t in naf.get_terms() if t.get_lemma() == 'London'][0]
    assert_equal(london.get_pos(), 'R')
    assert_equal(london.get_morphofeat(), 'NNP')

    # collect entity types keyed by the lemma of each referenced term
    ents = {}
    for e in naf.get_entities():
        for ref in e.get_references():
            for term_id in ref.get_span().get_span_ids():
                ents[terms[term_id]] = e.get_type()
    assert_equal(ents, {"John": "PERSON", "London": "LOCATION"})

    # map the lemma of each 'from' term to (relation, lemma of the 'to' term);
    # per the expected dict below, 'from' is the dependent and 'to' its head
    deps = {terms[d.get_from()]: (d.get_function(), terms[d.get_to()])
            for d in naf.get_dependencies()}
    expected = {'I': ('nsubj', 'hit'),
                'John': ('nsubj', 'attack'),
                'London': ('prep_in', 'attack'),
                'back': ('advmod', 'hit'),
                'he': ('dobj', 'hit')}
    assert_equal(deps, expected)

    # gather each coreference chain as a set of lemmas
    corefs = []
    for coref in naf.get_corefs():
        corefs.append(set())
        for span in coref.get_spans():
            corefs[-1] |= {terms[t] for t in span.get_span_ids()}
    assert_in({"John", "he"}, corefs)
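
As a usage note, the conversion this test exercises can be run standalone along the following lines. This is a minimal sketch: the `corenlp` import path and the input file name "parsed.xml" are assumptions, not part of the original.

# Minimal standalone sketch of the conversion exercised by the test.
# Assumptions: `corenlp` is the project-local module under test, and
# "parsed.xml" is a hypothetical CoreNLP XML output file on disk.
from io import BytesIO

from KafNafParserPy import KafNafParser

import corenlp

with open("parsed.xml") as f:
    xml = f.read()

naf = KafNafParser(BytesIO(corenlp.corenlp2naf(xml, corenlp.PARSER)))
for term in naf.get_terms():
    print(term.get_id(), term.get_lemma(), term.get_pos(), term.get_morphofeat())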
Example #2
import os

from KafNafParserPy import KafNafParser

# freqdict = defaultdict(int)  # dictionary to store frequencies; goes with simpledict, also commented out
# for token in my_parser.get_tokens():
#     freqdict[token.get_text()] += 1

# simpledict = {}

# walk through the working directory and look for KAF files
for root, dirs, files in os.walk("./"):
    for f in files:
        if f.endswith('.kaf'):
            output = "ENT_ID\tTYPE\tTERM\tKWIC\n"
            print("Reading:", f)
            my_parser = KafNafParser(os.path.join(root, f))  # create the KafNafParser
            # get the entities identified through NER
            for entity in my_parser.get_entities():
                # if it's a location, get its term references
                if entity.get_type() == "LOCATION":
                    entity_type = entity.get_type()  # the entity type
                    entity_id = entity.get_id()  # the entity id
                    # find the references node in the KAF file
                    ref = entity.get_node().find("references")
                    # get a list of all the targets in the references node
                    targets = ref.find("span").findall("target")
                    # first target term id, used to look up the term text below
                    term_id = targets[0].attrib["id"]
                    # get the word id from the term's span
                    word_id = my_parser.get_term(term_id).get_node().find(
                        "span").find("target").attrib["id"]
                    # get the token text for that word id
                    word = my_parser.get_token(word_id).get_text()
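
As an aside, the raw lxml navigation above (entity.get_node().find(...)) can generally be replaced by the object API that Example #1 uses. A sketch of the equivalent lookup, assuming my_parser is the KafNafParser created above:

# Same lookup via KafNafParserPy's object API (sketch; assumes `my_parser` as above)
for entity in my_parser.get_entities():
    if entity.get_type() == "LOCATION":
        for ref in entity.get_references():
            term_ids = ref.get_span().get_span_ids()      # term ids in the reference span
            first_term = my_parser.get_term(term_ids[0])  # first referenced term
            print(entity.get_id(), entity.get_type(), first_term.get_lemma())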
Example #3
import csv
import sys

from KafNafParserPy import KafNafParser

from naflib import get_sentence, sort_terms  # project-local helpers
# search words ('klimaatwoorden' = climate words) from the CSV; loaded but not used further in this excerpt
woorden = [r['original'] for r in csv.DictReader(open("klimaatwoorden.csv"))]

o = csv.writer(sys.stdout)
o.writerow(["file", "sentence", "entity", "type", "dbpedia", "text"])
for fn in sys.argv[1:]:
    naf = KafNafParser(fn)
    for e in naf.get_entities():
        for ref in e.get_references():
            # sort the terms of this reference span into document order
            terms = sort_terms(
                naf, [naf.get_term(t.get_id()) for t in ref.get_span()])
            o.writerow([
                fn,
                get_sentence(naf, terms[0]),
                e.get_id(),
                e.get_type(),
                "",  # dbpedia column: left empty so the row lines up with the six-column header
                " ".join(t.get_lemma() for t in terms),
            ])
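
To produce the CSV, run the script over one or more NAF files, e.g. python extract_entities.py doc1.naf doc2.naf > entities.csv (the script name here is hypothetical); each entity reference becomes one row on stdout.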