Exemplo n.º 1
0
def read_corenlp_doc(filename, verbose=True):
    """Read a CoreNLP XML file and build a ``Document`` from it.

    The file is opened through ``smart_file_handler`` and parsed with an
    ``etree.XMLParser`` whose target is a ``CoreNLPTarget``; the parse result
    is unpacked into sentences and coreference chains, which are handed to
    ``Document.construct``.

    Args:
        filename: Path of the CoreNLP XML output to read.
        verbose: When true, log the file being read at INFO level.

    Returns:
        The object returned by ``Document.construct``. Its name is the base
        name of ``filename`` with the last extension removed (only the final
        extension is stripped, e.g. ``a.xml.gz`` becomes ``a.xml``).
    """
    if verbose:
        log.info('Reading CoreNLP document from {}'.format(filename))

    # NOTE(review): smart_file_handler presumably returns a file-like object
    # exposing close() -- confirm against its definition.
    input_xml = smart_file_handler(filename)
    try:
        xml_parser = etree.XMLParser(target=CoreNLPTarget())
        # With a custom parser target, the parse result is unpacked here as a
        # (sentences, coreferences) pair.
        sents, corefs = etree.parse(input_xml, xml_parser)
        doc_name = splitext(basename(filename))[0]
        doc = Document.construct(doc_name, sents, corefs)
    finally:
        # Release the handle even when parsing or construction raises;
        # previously a failure here left the file open.
        input_xml.close()

    return doc
Exemplo n.º 2
0
def read_doc_from_ontonotes(coref_doc, name_doc, verbose=True):
    """Build a ``Document`` from a pair of OntoNotes annotation documents.

    The document id is the part of ``coref_doc.document_id`` before the first
    ``'@'``; ``name_doc`` must yield the same id. Sentences are read from the
    ``<doc_id>.depparse`` file under ``ontonotes_annotations_source``,
    coreference chains from ``coref_doc``, and every entity produced by
    ``read_name_doc(name_doc)`` is added to the constructed document.

    Args:
        coref_doc: Coreference annotation document; must expose
            ``document_id``.
        name_doc: Name annotation document for the same text; must expose
            ``document_id``.
        verbose: When true, log the document id at INFO level.

    Returns:
        The object returned by ``Document.construct``, named after the last
        ``'/'``-separated component of the document id.

    Raises:
        AssertionError: If the two inputs carry different document ids
            (only when assertions are enabled).
    """
    ontonotes_id = coref_doc.document_id.split('@')[0]
    assert ontonotes_id == name_doc.document_id.split('@')[0], \
        '{} and {} do not have the same document_id'.format(coref_doc, name_doc)

    if verbose:
        log.info('Reading ontonotes document {}'.format(ontonotes_id))

    # Only the trailing path component of the id is used as the document name.
    short_name = ontonotes_id.rsplit('/', 1)[-1]

    # Dependency parses first, then coreference chains (same order as before).
    sentences = read_conll_depparse(
        join(ontonotes_annotations_source, ontonotes_id + '.depparse'))
    coref_chains = read_coref_doc(coref_doc)

    document = Document.construct(short_name, sentences, coref_chains)

    for entity in read_name_doc(name_doc):
        add_name_entity_to_doc(document, entity)

    return document