def convert(self, id, result, format):
    assert format == "csv"
    _int = lambda x: None if x is None else int(x)
    naf = KafNafParser(BytesIO(result.encode("utf-8")))
    # map each dependent term id to its (function, head term id) pair
    deps = {dep.get_to(): (dep.get_function(), dep.get_from())
            for dep in naf.get_dependencies()}
    tokendict = {token.get_id(): token for token in naf.get_tokens()}
    s = StringIO()
    w = csv.writer(s)
    w.writerow(["id", "token_id", "offset", "sentence", "para", "word",
                "term_id", "lemma", "pos", "pos1", "parent", "relation"])
    for term in naf.get_terms():
        tokens = [tokendict[id] for id in term.get_span().get_span_ids()]
        for token in tokens:
            tid = term.get_id()
            pos = term.get_pos()
            pos1 = POSMAP[pos]
            # one row per token; get_sent() fills the "sentence" column
            # promised by the header
            row = [id, token.get_id(), _int(token.get_offset()),
                   _int(token.get_sent()), _int(token.get_para()),
                   token.get_text(), tid, term.get_lemma(), pos, pos1]
            if tid in deps:
                rel, parent = deps[tid]
                row += [parent, rel.split("/")[-1]]
            else:
                row += [None, None]
            w.writerow(row)
    return s.getvalue()
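# A minimal sketch of consuming the CSV that convert() returns; DictReader
# keys each row by the header written above. The `converter`, `article_id`,
# and `naf_string` names in the usage comment are hypothetical stand-ins.
import csv
from io import StringIO

def rows_from_csv(csv_text):
    """Parse the CSV produced by convert() into a list of row dicts."""
    return list(csv.DictReader(StringIO(csv_text)))

# rows = rows_from_csv(converter.convert(article_id, naf_string, "csv"))
# print(rows[0]["lemma"], rows[0]["relation"])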
def test_corenlp2naf():
    xml = open(os.path.join(os.path.dirname(__file__), "test_corenlp.xml")).read()
    naf_bytes = corenlp.corenlp2naf(xml, corenlp.PARSER)
    naf = KafNafParser(BytesIO(naf_bytes))
    terms = {t.get_id(): t.get_lemma() for t in naf.get_terms()}
    assert_equal(set(terms.values()),
                 {"John", "attack", "I", "in", "London", "hit", "he", "back", "."})
    london = [t for t in naf.get_terms() if t.get_lemma() == 'London'][0]
    assert_equal(london.get_pos(), 'R')
    assert_equal(london.get_morphofeat(), 'NNP')
    ents = {}
    for e in naf.get_entities():
        for ref in e.get_references():
            for term_id in ref.get_span().get_span_ids():
                ents[terms[term_id]] = e.get_type()
    assert_equal(ents, {"John": "PERSON", "London": "LOCATION"})
    deps = {terms[d.get_from()]: (d.get_function(), terms[d.get_to()])
            for d in naf.get_dependencies()}
    expected = {'I': ('nsubj', 'hit'),
                'John': ('nsubj', 'attack'),
                'London': ('prep_in', 'attack'),
                'back': ('advmod', 'hit'),
                'he': ('dobj', 'hit')}
    assert_equal(deps, expected)
    corefs = []
    for coref in naf.get_corefs():
        corefs.append(set())
        for span in coref.get_spans():
            corefs[-1] |= {terms[t] for t in span.get_span_ids()}
    assert_in({"John", "he"}, corefs)
def from_naf(self, article, naf):
    def _int(x):
        return None if x is None else int(x)
    naf = KafNafParser(BytesIO(naf.encode("utf-8")))
    # map each dependent term id to its (function, head term id) pair
    deps = {dep.get_to(): (dep.get_function(), dep.get_from())
            for dep in naf.get_dependencies()}
    tokendict = {token.get_id(): token for token in naf.get_tokens()}
    for term in naf.get_terms():
        tokens = [tokendict[id] for id in term.get_span().get_span_ids()]
        for token in tokens:
            tid = term.get_id()
            tok = {"aid": article,
                   "token_id": token.get_id(),
                   "offset": _int(token.get_offset()),
                   "sentence": _int(token.get_sent()),
                   "para": _int(token.get_para()),
                   "word": token.get_text(),
                   "term_id": tid,
                   "lemma": term.get_lemma(),
                   "pos": term.get_pos()}
            if tid in deps:
                rel, parent = deps[tid]
                tok['parent'] = parent
                tok['relation'] = rel.split("/")[-1]
            yield tok
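# A small sketch of grouping the token dicts that from_naf() yields by
# sentence number; `doc`, `article_id`, and `naf_string` are hypothetical
# stand-ins for an instance of the class above and its inputs.
from collections import defaultdict

def tokens_by_sentence(tokens):
    """Group from_naf() token dicts into {sentence: [token, ...]}."""
    sentences = defaultdict(list)
    for tok in tokens:
        sentences[tok["sentence"]].append(tok)
    for toks in sentences.values():
        # keep tokens in reading order within each sentence
        toks.sort(key=lambda t: t["offset"])
    return dict(sentences)

# grouped = tokens_by_sentence(doc.from_naf(article_id, naf_string))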
def _test_file(this_file):
    input_fd = open(this_file)
    result = subprocess.check_output(os.path.join(__here__, 'run_parser.sh'),
                                     stdin=input_fd)
    my_obj = KafNafParser(BytesIO(result))
    # Check the terms
    terms = [term for term in my_obj.get_terms()]
    assert_equal(len(terms), 12)
    assert_equal(my_obj.get_term('t_4').get_lemma(), 'mooi')
    assert_equal(my_obj.get_term('t_4').get_pos(), 'adj')
    # Check constituents
    trees = [tree for tree in my_obj.get_trees()]
    assert_equal(len(trees), 2)
    assert_equal(trees[0].get_terminals_as_list()[1].get_span().get_span_ids(), ['t_1'])
    # Check dependencies
    dependencies = [dep for dep in my_obj.get_dependencies()]
    assert_equal(len(dependencies), 10)
    assert_equal(dependencies[5].get_function(), 'hd/su')
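# A sketch of driving _test_file() over a directory of example inputs using
# the nose test-generator idiom that matches the assert_* helpers above; the
# 'examples' directory name is an assumption, not taken from the code.
def test_all_files():
    example_dir = os.path.join(__here__, 'examples')
    for name in sorted(os.listdir(example_dir)):
        yield _test_file, os.path.join(example_dir, name)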
# Stand-alone script: read a NAF document from stdin and collect the verbal
# predicates and dependency patterns needed to build feature vectors.
import re
import sys

from KafNafParserPy import KafNafParser

input = sys.stdin
my_parser = KafNafParser(input)

### We first need a list of the predicates that we want to create feature vectors for
predicates = {}
for term_obj in my_parser.get_terms():
    predicate = re.match("WW", term_obj.get_morphofeat())
    if predicate is not None:
        predicates[term_obj.get_id()] = term_obj.get_pos()
        # print term_obj.get_id(), term_obj.get_morphofeat(), term_obj.get_lemma()

# We need the dependencies to find out the structure of the argument patterns
# and also to know which verbs are auxiliary verbs and which ones are main verbs
dependencies = {}
for dep_obj in my_parser.get_dependencies():
    relparts = dep_obj.get_function().split('/')
    rel_from = relparts[0]
    rel_to = relparts[1]
    dep_id = dep_obj.get_from() + '-' + dep_obj.get_to()
    dependencies[dep_id] = dep_obj.get_function()

# We also want to distinguish between main verbs and auxiliary verbs;
# for this we first gather all dependency patterns and store those
dep_patterns = {}
for deps in dependencies:
    dep_parts = deps.split('-')
    # dep_patterns[dep_parts[0]] = []
    func_parts = dependencies[deps].split('/')
    if predicates.get(dep_parts[0]) is not None:
        if dependencies[deps] != '-- / --':
def load_naf_stdin():
    """Load a dataset in NAF format.

    Use this function to create a new ConlluDataset from a NAF file,
    read from stdin.

    NOTE: you can only add to NAF files, not create one from scratch.
    """
    my_parser = KafNafParser(sys.stdin)
    my_dataset = ConlluDataset()

    # a big look-up table: for any NAF id, return a hash with
    # {sent_id, token_id} in the ConlluDataset
    naf2conll_id = {}

    # collect the sentences in a hash, indexed by token_obj.get_sent()
    sentences = {}

    # iterate over the tokens to get: ID, FORM
    for token_obj in my_parser.get_tokens():
        # (string) identifier of the sentence
        sent_id = token_obj.get_sent()
        if sent_id in sentences:
            sentence = sentences[sent_id]
        else:
            sentence = Sentence(sent_id=sent_id)
            sentences[sent_id] = sentence

        # (string) number of the token in the sentence, starting at '1'
        token_id = '{}'.format(len(sentence) + 1)

        new_token = Token([
            token_id,              # ID
            token_obj.get_text(),  # FORM
            '_',                   # LEMMA
            '_',                   # UPOS
            '_',                   # XPOS
            '_',                   # FEATS
            '0',                   # HEAD -> to be overwritten later
            'root',                # DEPREL -> to be overwritten later
            '_',                   # DEPS
            '_'                    # MISC
        ])
        sentence.add(new_token)

        # to match a NAF span to conll tokens, we need sent_id and token_id
        naf2conll_id[token_obj.get_id()] = {
            'sent_id': sent_id,
            'token_id': token_id
        }

    # iterate over the terms to get: LEMMA, XPOS, UPOS, FEATS, sent_id, nafid
    for term_obj in my_parser.get_terms():
        # span
        # TODO: for now, assume terms map one-on-one on tokens
        nafid = term_obj.get_span().get_span_ids()
        if len(nafid) > 1:
            logging.error('Multi-word tokens not implemented yet.')
            return
        nafid = nafid[0]

        conllid = naf2conll_id[nafid]
        sent_id = conllid['sent_id']
        sentence = sentences[sent_id]
        token_id = conllid['token_id']
        token = sentence[token_id]

        # store the identifier of the NAF term on the token, so we can add
        # information to the NAF later.
        token.nafid = term_obj.get_id()

        token.LEMMA = term_obj.get_lemma()

        # NAF pos='' is in lower case, UD UPOS is upper case
        token.UPOS = term_obj.get_pos().upper()

        # naf: A(B,C) -> ud: A|B|C
        xpos = term_obj.get_morphofeat()
        if xpos:
            token.XPOS = xpos.replace('(', '|').replace(')', '').replace(',', '|')
            if token.XPOS[-1] == '|':
                token.XPOS = token.XPOS[:-1]

        # look for an external reference containing FEATS
        for ext_ref in term_obj.get_external_references():
            if ext_ref.get_reftype() == 'FEATS':
                token.FEATS = ext_ref.get_reference()

        # to match NAF dependencies to conll tokens, we need sent_id and token_id
        naf2conll_id[term_obj.get_id()] = {
            'sent_id': sent_id,
            'token_id': token_id
        }

    # iterate over the dependencies to get: HEAD, DEPREL
    for dep_obj in my_parser.get_dependencies():
        # from
        conllid = naf2conll_id[dep_obj.get_from()]
        sent_id = conllid['sent_id']
        sentence = sentences[sent_id]
        token_id = conllid['token_id']
        token_from = sentence[token_id]

        # to
        conllid = naf2conll_id[dep_obj.get_to()]
        sent_id = conllid['sent_id']
        sentence = sentences[sent_id]
        token_id = conllid['token_id']
        token_to = sentence[token_id]

        # function
        depfunc = dep_obj.get_function()

        token_to.HEAD = token_from.ID
        token_to.DEPREL = depfunc

    # A final conversion of our list of sentences to a ConlluDataset
    for sent_id in sentences:
        sentence = sentences[sent_id]

        # construct the sentence.full_text
        raw_tokens = []
        for token in sentence:
            raw_tokens.append(token.FORM)
        sentence.full_text = ' '.join(raw_tokens)

        # add to the dataset
        my_dataset.add(sentence)

    my_dataset.naf2conll_id = naf2conll_id

    return my_dataset, my_parser
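# A minimal usage sketch for load_naf_stdin(). It assumes ConlluDataset is
# iterable over its sentences (only add() is used above, so this is an
# assumption) and that each Token keeps the CoNLL-U fields set there.
import sys

def dump_conllu(dataset, out=sys.stdout):
    for sentence in dataset:  # assumed: iterating a dataset yields sentences
        out.write('# text = {}\n'.format(sentence.full_text))
        for token in sentence:
            out.write('\t'.join([token.ID, token.FORM, token.LEMMA,
                                 token.UPOS, token.XPOS, token.FEATS,
                                 token.HEAD, token.DEPREL, '_', '_']) + '\n')
        out.write('\n')

# my_dataset, my_parser = load_naf_stdin()
# dump_conllu(my_dataset)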