Example #1
    def convert(self, id, result, format):
        assert format == "csv"

        _int = lambda x: None if x is None else int(x)
        naf = KafNafParser(BytesIO(result.encode("utf-8")))

        deps = {dep.get_to(): (dep.get_function(), dep.get_from())
                for dep in naf.get_dependencies()}
        tokendict = {token.get_id(): token for token in naf.get_tokens()}

        s = StringIO()
        w = csv.writer(s)
        w.writerow(["id", "token_id", "offset", "sentence", "para", "word", "term_id",
                    "lemma", "pos", "pos1", "parent", "relation"])
        for term in naf.get_terms():
            tokens = [tokendict[id] for id in term.get_span().get_span_ids()]
            for token in tokens:
                tid = term.get_id()
                pos = term.get_pos()
                pos1 = POSMAP[pos]
                row = [id, token.get_id(), _int(token.get_offset()), _int(token.get_sent()),
                       _int(token.get_para()), token.get_text(),
                       tid, term.get_lemma(), pos, pos1]
                if tid in deps:
                    rel, parent = deps[tid]
                    row += [parent, rel.split("/")[-1]]
                else:
                    row += [None, None]
                w.writerow(row)
        return s.getvalue()
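The deps comprehension above indexes each dependency by its child term, so the per-token loop can look up head and relation in constant time. A minimal, self-contained sketch of that lookup pattern, with invented tuples standing in for KafNafParser dependency objects:

# Stand-in for naf.get_dependencies(): (from_term, to_term, function) triples.
# The data below is invented purely for illustration.
sample_deps = [
    ("t_2", "t_1", "hd/su"),    # t_1 depends on t_2 as subject
    ("t_2", "t_3", "hd/obj1"),  # t_3 depends on t_2 as object
]

# Same shape as the comprehension above: child term -> (relation, head term).
deps = {to: (func, frm) for frm, to, func in sample_deps}

for child in ("t_1", "t_3", "t_4"):
    rel, parent = deps.get(child, (None, None))
    # As in the example, keep only the last component of the relation label.
    print(child, parent, rel.split("/")[-1] if rel else None)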
Example #2
def test_corenlp2naf():
    xml = open(os.path.join(os.path.dirname(__file__), "test_corenlp.xml")).read()
    naf_bytes = corenlp.corenlp2naf(xml, corenlp.PARSER)
    naf = KafNafParser(BytesIO(naf_bytes))
    terms = {t.get_id(): t.get_lemma() for t in naf.get_terms()}
    assert_equal(set(terms.values()), {"John", "attack", "I", "in", "London", "hit", "he", "back", "."})
    london = [t for t in naf.get_terms() if t.get_lemma() == 'London'][0]
    assert_equal(london.get_pos(), 'R')
    assert_equal(london.get_morphofeat(), 'NNP')

    ents = {}
    for e in naf.get_entities():
        for ref in e.get_references():
            for term_id in ref.get_span().get_span_ids():
                ents[terms[term_id]] = e.get_type()
    assert_equal(ents, {"John": "PERSON", "London": "LOCATION"})

    deps = {terms[d.get_from()]: (d.get_function(), terms[d.get_to()])
            for d in naf.get_dependencies()}
    expected = {'I': ('nsubj', 'hit'),
                'John': ('nsubj', 'attack'),
                'London': ('prep_in', 'attack'),
                'back': ('advmod', 'hit'),
                'he': ('dobj', 'hit')}
    assert_equal(deps, expected)

    corefs = []
    for coref in naf.get_corefs():
        corefs.append(set())
        for span in coref.get_spans():
            corefs[-1] |= {terms[t] for t in span.get_span_ids()}
    assert_in({"John", "he"}, corefs)
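The entity block above walks entity -> references -> span -> term ids to build a lemma-to-type map. A tiny stand-alone sketch of the same nesting, with invented data replacing the KafNafParser objects:

# Invented stand-ins: each entity is (type, list of references),
# each reference being a list of term ids.
sample_entities = [
    ("PERSON", [["t_1"]]),
    ("LOCATION", [["t_5"]]),
]
terms = {"t_1": "John", "t_5": "London"}  # term id -> lemma

ents = {}
for ent_type, references in sample_entities:
    for span_ids in references:
        for term_id in span_ids:
            ents[terms[term_id]] = ent_type

print(ents)  # {'John': 'PERSON', 'London': 'LOCATION'}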
Example #3
    def from_naf(self, article, naf):
        def _int(x):
            return None if x is None else int(x)
        naf = KafNafParser(BytesIO(naf.encode("utf-8")))

        deps = {dep.get_to(): (dep.get_function(), dep.get_from())
                for dep in naf.get_dependencies()}
        tokendict = {token.get_id(): token for token in naf.get_tokens()}

        for term in naf.get_terms():
            tokens = [tokendict[id] for id in term.get_span().get_span_ids()]
            for token in tokens:
                tid = term.get_id()
                tok = {"aid": article,
                       "token_id": token.get_id(),
                       "offset": _int(token.get_offset()),
                       "sentence": _int(token.get_sent()),
                       "para": _int(token.get_para()),
                       "word": token.get_text(),
                       "term_id": tid,
                       "lemma": term.get_lemma(),
                       "pos": term.get_pos()}
                if tid in deps:
                    rel, parent = deps[tid]
                    tok['parent'] = parent
                    tok['relation'] = rel.split("/")[-1]
                yield tok
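from_naf yields one dict per token and simply omits parent/relation when a term has no incoming dependency. A hedged sketch of writing those dicts to CSV with csv.DictWriter, using the keys produced above as columns; the from_naf call is commented out because it needs a real article id and NAF string, and the sample row is invented:

import csv
import sys

fields = ["aid", "token_id", "offset", "sentence", "para", "word",
          "term_id", "lemma", "pos", "parent", "relation"]

# rows = list(obj.from_naf(article, naf_string))  # hypothetical call
rows = [{"aid": 1, "token_id": "w1", "offset": 0, "sentence": 1, "para": 1,
         "word": "hello", "term_id": "t1", "lemma": "hello", "pos": "N"}]

w = csv.DictWriter(sys.stdout, fieldnames=fields, restval="")
w.writeheader()
w.writerows(rows)  # tokens without parent/relation fall back to restval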
Example #4
def test_corenlp2naf():
    xml = open(os.path.join(os.path.dirname(__file__),
                            "test_corenlp.xml")).read()
    naf_bytes = corenlp.corenlp2naf(xml, corenlp.PARSER)
    naf = KafNafParser(BytesIO(naf_bytes))
    terms = {t.get_id(): t.get_lemma() for t in naf.get_terms()}
    assert_equal(
        set(terms.values()),
        {"John", "attack", "I", "in", "London", "hit", "he", "back", "."})
    london = [t for t in naf.get_terms() if t.get_lemma() == 'London'][0]
    assert_equal(london.get_pos(), 'R')
    assert_equal(london.get_morphofeat(), 'NNP')

    ents = {}
    for e in naf.get_entities():
        for ref in e.get_references():
            for term_id in ref.get_span().get_span_ids():
                ents[terms[term_id]] = e.get_type()
    assert_equal(ents, {"John": "PERSON", "London": "LOCATION"})

    deps = {
        terms[d.get_from()]: (d.get_function(), terms[d.get_to()])
        for d in naf.get_dependencies()
    }
    expected = {
        'I': ('nsubj', 'hit'),
        'John': ('nsubj', 'attack'),
        'London': ('prep_in', 'attack'),
        'back': ('advmod', 'hit'),
        'he': ('dobj', 'hit')
    }
    assert_equal(deps, expected)

    corefs = []
    for coref in naf.get_corefs():
        corefs.append(set())
        for span in coref.get_spans():
            corefs[-1] |= {terms[t] for t in span.get_span_ids()}
    assert_in({"John", "he"}, corefs)
Example #5
def _test_file(this_file):
    input_fd = open(this_file)

    result = subprocess.check_output(os.path.join(__here__, 'run_parser.sh'), stdin=input_fd)
    my_obj = KafNafParser(BytesIO(result))

    # Check the terms
    terms = [term for term in my_obj.get_terms()]
    assert_equal(len(terms), 12)
    assert_equal(my_obj.get_term('t_4').get_lemma(), 'mooi')
    assert_equal(my_obj.get_term('t_4').get_pos(), 'adj')

    # Check constituents
    trees = [tree for tree in my_obj.get_trees()]
    assert_equal(len(trees), 2)
    assert_equal(trees[0].get_terminals_as_list()[1].get_span().get_span_ids(), ['t_1'])

    # Check dependencies
    dependencies = [dep for dep in my_obj.get_dependencies()]
    assert_equal(len(dependencies), 10)
    assert_equal(dependencies[5].get_function(), 'hd/su')
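The test above pipes a file into an external parser script and reads the NAF that the script prints to stdout. A minimal sketch of that subprocess pattern; run_parser.sh and input.txt are placeholders, and the KafNafParser import is commented out and assumed to come from the KafNafParserPy package used throughout these examples:

import subprocess
from io import BytesIO
# from KafNafParserPy import KafNafParser  # assumed import of the parser library

with open("input.txt", "rb") as input_fd:            # placeholder input file
    result = subprocess.check_output("./run_parser.sh",  # placeholder script
                                     stdin=input_fd)

# naf = KafNafParser(BytesIO(result))  # parse the NAF produced on stdout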
Example #6
input = sys.stdin

my_parser = KafNafParser(input)

### We first need a list of the predicates that we want to create feature vectors for
predicates = {} 
for term_obj in my_parser.get_terms():
	predicate = re.match("WW", term_obj.get_morphofeat())
	if predicate is not None:
		predicates[term_obj.get_id()] = term_obj.get_pos()
		#print term_obj.get_id(), term_obj.get_morphofeat(), term_obj.get_lemma()

# We need the dependencies to find out the structure of the argument patterns
# and also to know which verbs are auxiliary verbs and which ones are main verbs
dependencies = {} 
for dep_obj in my_parser.get_dependencies():
	relparts = dep_obj.get_function().split('/')
	rel_from = relparts[0]
	rel_to = relparts[1]
	dep_id = dep_obj.get_from() + '-' + dep_obj.get_to() 
	dependencies[dep_id] = dep_obj.get_function()

# We also want to distinguish between main verbs and auxiliary verbs
# for this we first gather all dependency patterns and store those 
dep_patterns = {} 		
for deps in dependencies:
	dep_parts = deps.split('-')
	#dep_patterns[dep_parts[0]] = [] 
	func_parts = dependencies[deps].split('/')
	if predicates.get(dep_parts[0]) is not None:
		if dependencies[deps] != '-- / --':
Example #7
def load_naf_stdin():
    """Load a dataset in NAF format.

    Use this function to create a new ConlluDataset from a NAF file,
    read from stdin.

    NOTE: you can only add to NAF files, not create one from scratch.
    """
    my_parser = KafNafParser(sys.stdin)

    my_dataset = ConlluDataset()

    # a big look-up table: for any NAF id, return a hash with
    # {sent_id, token_id} in the ConlluDataset
    naf2conll_id = {}

    # collect the sentences in a hash, indexed by token_obj.get_sent()
    sentences = {}

    # iterate over the tokens to get: ID, FORM
    for token_obj in my_parser.get_tokens():
        # (string) identifier of the sentence
        sent_id = token_obj.get_sent()
        if sent_id in sentences:
            sentence = sentences[sent_id]
        else:
            sentence = Sentence(sent_id=sent_id)
            sentences[sent_id] = sentence

        # (string) number of the token in the sentence, starting at '1'
        token_id = '{}'.format(len(sentence) + 1)  # ID

        new_token = Token([
            token_id,  # ID
            token_obj.get_text(),  # FORM
            '_',  # LEMMA
            '_',  # UPOS
            '_',  # XPOS
            '_',  # FEATS
            '0',  # HEAD -> to be overwritten later
            'root',  # DEPREL -> to be overwritten later
            '_',  # DEPS
            '_'  # MISC
        ])

        sentence.add(new_token)

        # to match a NAF span to conll tokens, we need sent_id and token_id
        naf2conll_id[token_obj.get_id()] = {
            'sent_id': sent_id,
            'token_id': token_id
        }

    # iterate over the terms to get: LEMMA, XPOS, UPOS, FEATS, sent_id, nafid
    for term_obj in my_parser.get_terms():
        # span
        # TODO: for now, assume terms map one-on-one on tokens
        nafid = term_obj.get_span().get_span_ids()
        if len(nafid) > 1:
            logging.error('Multi-word tokens not implemented yet.')
            return
        nafid = nafid[0]

        conllid = naf2conll_id[nafid]
        sent_id = conllid['sent_id']
        sentence = sentences[sent_id]

        token_id = conllid['token_id']
        token = sentence[token_id]

        # store the identifier of the NAF term on the token, so we can add
        # information to the NAF later.
        token.nafid = term_obj.get_id()

        token.LEMMA = term_obj.get_lemma()

        # the NAF pos attribute is lower case, UD UPOS is upper case
        token.UPOS = term_obj.get_pos().upper()

        # naf: A(B,C) -> ud: A|B|C
        xpos = term_obj.get_morphofeat()
        if xpos:
            token.XPOS = xpos.replace('(', '|').replace(')',
                                                        '').replace(',', '|')
            if token.XPOS[-1] == '|':
                token.XPOS = token.XPOS[:-1]

        # look for an external reference containing FEATS
        for ext_ref in term_obj.get_external_references():
            if ext_ref.get_reftype() == 'FEATS':
                token.FEATS = ext_ref.get_reference()

        # to match NAF dependencies to conll tokens, we need sent_id and token_id
        naf2conll_id[term_obj.get_id()] = {
            'sent_id': sent_id,
            'token_id': token_id
        }

    # iterate over the dependencies to get: HEAD, DEPREL
    for dep_obj in my_parser.get_dependencies():
        # from
        conllid = naf2conll_id[dep_obj.get_from()]
        sent_id = conllid['sent_id']
        sentence = sentences[sent_id]

        token_id = conllid['token_id']
        token_from = sentence[token_id]

        # to
        conllid = naf2conll_id[dep_obj.get_to()]
        sent_id = conllid['sent_id']
        sentence = sentences[sent_id]

        token_id = conllid['token_id']
        token_to = sentence[token_id]

        # function
        depfunc = dep_obj.get_function()

        token_to.HEAD = token_from.ID
        token_to.DEPREL = depfunc

    # A final conversion of our list of sentences to a ConlluDataset
    for sent_id in sentences:
        sentence = sentences[sent_id]

        # construct the sentence.full_text
        raw_tokens = []
        for token in sentence:
            raw_tokens.append(token.FORM)
        sentence.full_text = ' '.join(raw_tokens)

        # add to the dataset
        my_dataset.add(sentence)

    my_dataset.naf2conll_id = naf2conll_id

    return my_dataset, my_parser
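Each Token above is constructed from the ten standard CoNLL-U columns, with HEAD and DEPREL overwritten once the dependencies are read. A minimal sketch of how such a row serializes to a CoNLL-U line; plain lists stand in for the Token and Sentence classes, which come from the surrounding ConlluDataset module:

# The ten CoNLL-U columns, in the order used by the Token constructor above.
CONLLU_FIELDS = ["ID", "FORM", "LEMMA", "UPOS", "XPOS",
                 "FEATS", "HEAD", "DEPREL", "DEPS", "MISC"]

# Invented sample token after LEMMA, UPOS, HEAD and DEPREL have been filled in.
row = ["1", "John", "John", "PROPN", "NNP", "_", "2", "nsubj", "_", "_"]
assert len(row) == len(CONLLU_FIELDS)

# CoNLL-U writes one token per line, columns separated by tabs.
print("\t".join(row))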