import os
from io import BytesIO

from KafNafParserPy import KafNafParser
from nose.tools import assert_equal, assert_in

import corenlp  # local module providing corenlp2naf() and the PARSER pipeline constant


def test_corenlp2naf():
    # Convert the CoreNLP XML fixture to NAF and parse the result
    xml = open(os.path.join(os.path.dirname(__file__), "test_corenlp.xml")).read()
    naf_bytes = corenlp.corenlp2naf(xml, corenlp.PARSER)
    naf = KafNafParser(BytesIO(naf_bytes))

    # Lemmas: map term id -> lemma and check the full lemma set
    terms = {t.get_id(): t.get_lemma() for t in naf.get_terms()}
    assert_equal(set(terms.values()),
                 {"John", "attack", "I", "in", "London", "hit", "he", "back", "."})

    # POS: 'London' should be a proper noun (NAF pos 'R', Penn tag 'NNP')
    london = [t for t in naf.get_terms() if t.get_lemma() == 'London'][0]
    assert_equal(london.get_pos(), 'R')
    assert_equal(london.get_morphofeat(), 'NNP')

    # Named entities: map each covered lemma -> entity type
    ents = {}
    for e in naf.get_entities():
        for ref in e.get_references():
            for term_id in ref.get_span().get_span_ids():
                ents[terms[term_id]] = e.get_type()
    assert_equal(ents, {"John": "PERSON", "London": "LOCATION"})

    # Dependencies: map each 'from' term's lemma -> (relation, 'to' term's lemma)
    deps = {terms[d.get_from()]: (d.get_function(), terms[d.get_to()])
            for d in naf.get_dependencies()}
    expected = {'I': ('nsubj', 'hit'),
                'John': ('nsubj', 'attack'),
                'London': ('prep_in', 'attack'),
                'back': ('advmod', 'hit'),
                'he': ('dobj', 'hit')}
    assert_equal(deps, expected)

    # Coreference: 'John' and 'he' should end up in the same chain
    corefs = []
    for coref in naf.get_corefs():
        corefs.append(set())
        for span in coref.get_spans():
            corefs[-1] |= {terms[t] for t in span.get_span_ids()}
    assert_in({"John", "he"}, corefs)
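# A quick manual check (a sketch, not part of the test): KafNafParser objects have a
# dump() method that writes the NAF XML to stdout, which is handy for inspecting
# what corenlp2naf actually produced before asserting on it.
if __name__ == "__main__":
    xml = open(os.path.join(os.path.dirname(__file__), "test_corenlp.xml")).read()
    naf = KafNafParser(BytesIO(corenlp.corenlp2naf(xml, corenlp.PARSER)))
    naf.dump()  # write the generated NAF document to stdout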
import os

from KafNafParserPy import KafNafParser

# from collections import defaultdict  # needed if the freqdict lines below are enabled
# freqdict = defaultdict(int)  # create a dictionary to store frequencies; goes with simpledict, also commented out
# for token in my_parser.get_tokens():
#     freqdict[token.get_text()] += 1

# In[153]:

# simpledict = {}
for root, dirs, files in os.walk("./"):  # walk through the working directory and look for KAF files
    for f in files:
        if f.endswith('.kaf'):
            output = "ENT_ID\tTYPE\tTERM\tKWIC\n"
            print("Reading:", f)
            my_parser = KafNafParser(os.path.join(root, f))  # create the KafNafParser (join with root so files in subfolders load too)
            for entity in my_parser.get_entities():  # get the entities identified through NER
                if entity.get_type() == "LOCATION":  # if it's a location, then let's get its term references
                    entity_type = entity.get_type()  # get the entity type
                    entity_id = entity.get_id()  # get the entity id
                    ref = entity.get_node().find("references")  # find the references node in the KAF file
                    targets = ref.find("span").findall("target")  # get a list of all the targets in the references node
                    term_id = targets[0].attrib["id"]  # get the first target term id, so we can generate the term text in the next two lines
                    word_id = my_parser.get_term(term_id).get_node().find("span").find("target").attrib["id"]  # get the word id
                    word = my_parser.get_token(word_id).get_text()  # get the word text itself
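# A hypothetical continuation for the snippet above, which breaks off before the
# output rows are built: the header declares a KWIC column, so one way (an
# assumption, not the original author's code) to fill it is a window of tokens
# around the entity word within the same sentence. KafNafParserPy token objects
# expose get_sent(), get_id() and get_text(); `window` is an illustrative parameter.
def kwic(parser, token_id, window=5):
    token = parser.get_token(token_id)
    sent_tokens = [t for t in parser.get_tokens() if t.get_sent() == token.get_sent()]
    idx = [t.get_id() for t in sent_tokens].index(token.get_id())
    context = sent_tokens[max(0, idx - window): idx + window + 1]
    return " ".join(t.get_text() for t in context)

# e.g. inside the loop above:
#     output += "%s\t%s\t%s\t%s\n" % (entity_id, entity_type, word, kwic(my_parser, word_id))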
import csv
import sys

from KafNafParserPy import KafNafParser
from naflib import get_sentence, sort_terms  # local helper module

# Load the original climate search words ("klimaatwoorden" = climate words)
woorden = [r['original'] for r in csv.DictReader(open("klimaatwoorden.csv"))]

o = csv.writer(sys.stdout)
o.writerow(["file", "sentence", "entity", "type", "text"])
for fn in sys.argv[1:]:
    naf = KafNafParser(fn)
    for e in naf.get_entities():
        for ref in e.get_references():
            # Sort the terms covered by this entity reference into text order
            terms = sort_terms(naf, [naf.get_term(t.get_id()) for t in ref.get_span()])
            o.writerow([fn,
                        get_sentence(naf, terms[0]),
                        e.get_id(),
                        e.get_type(),
                        " ".join(t.get_lemma() for t in terms)])
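# `naflib` is a local helper module; if it is unavailable, a minimal stand-in for
# get_sentence can be sketched (an assumption, not naflib's actual code) by looking
# up the sentence number of the first token a term covers:
def get_sentence_fallback(naf, term):
    first_word_id = term.get_span().get_span_ids()[0]
    return naf.get_token(first_word_id).get_sent()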