Пример #1
0
def write_freqt(docgraph, output_filepath, include_pos=False):
    """
    Convert a docgraph into a FREQT input file (one sentence per line).

    Parameters
    ----------
    docgraph : document graph
        graph whose ``sentences`` attribute lists the sentences to export
    output_filepath : str
        path of the file to be written (UTF-8 encoded). Its parent directory
        is created if it doesn't exist, yet.
    include_pos : bool
        handed to ``docgraph2freqt`` unchanged for every sentence
    """
    # abspath() makes a bare file name (e.g. 'out.freqt') resolve to the
    # current working directory. Without it, dirname() returns '' which is
    # not a directory and can't be created either.
    path_to_file = os.path.dirname(os.path.abspath(output_filepath))
    if not os.path.isdir(path_to_file):
        create_dir(path_to_file)
    with codecs.open(output_filepath, 'w', 'utf-8') as output_file:
        for sentence in docgraph.sentences:
            output_file.write(docgraph2freqt(docgraph, sentence,
                              include_pos=include_pos)+'\n')
Пример #2
0
def write_freqt(docgraph, output_filepath, include_pos=False):
    """
    Convert a docgraph into a FREQT input file (one sentence per line).

    Parameters
    ----------
    docgraph : document graph
        graph whose ``sentences`` attribute lists the sentences to export
    output_filepath : str
        path of the UTF-8 encoded file to write. A missing parent directory
        is created first.
    include_pos : bool
        forwarded to ``docgraph2freqt`` for each sentence
    """
    # Resolve the path before taking its directory part: for a file in the
    # current directory dirname() alone yields '', which is neither an
    # existing directory nor something that can be created.
    target_dir = os.path.dirname(os.path.abspath(output_filepath))
    if not os.path.isdir(target_dir):
        create_dir(target_dir)
    with codecs.open(output_filepath, 'w', 'utf-8') as output_file:
        for sentence in docgraph.sentences:
            output_file.write(
                docgraph2freqt(docgraph, sentence, include_pos=include_pos) +
                '\n')
Пример #3
0
def write_exb(docgraph, output_file):
    """
    Convert a DiscourseDocumentGraph into an Exmaralda ``*.exb`` file and
    write it to the given file (or file path).

    Parameters
    ----------
    docgraph : DiscourseDocumentGraph
        the document graph to be converted
    output_file : str or file-like object
        If it is a string, it is used as the path of the output file (a
        missing parent directory is created first). Otherwise it must offer
        a ``write()`` method, which receives the serialized document.

    Raises
    ------
    TypeError
        if ``output_file`` is neither a string nor a writable object
    """
    exmaralda_file = ExmaraldaFile(docgraph)
    if isinstance(output_file, str):
        # abspath() is needed for files in the current directory, where
        # dirname() alone would return an empty string.
        path_to_file = os.path.dirname(os.path.abspath(output_file))
        if not os.path.isdir(path_to_file):
            create_dir(path_to_file)
        exmaralda_file.write(output_file)
    elif hasattr(output_file, 'write'):  # output_file is a file object
        output_file.write(exmaralda_file.__str__())
    else:
        # explicit check instead of ``assert``, which is dropped under -O
        raise TypeError(
            "output_file must be a file path or a writable file object, "
            "not {0!r}".format(type(output_file)))
Пример #4
0
def write_exb(docgraph, output_file):
    """
    Convert a DiscourseDocumentGraph into an Exmaralda ``*.exb`` file and
    write it to the given file (or file path).

    Parameters
    ----------
    docgraph : DiscourseDocumentGraph
        the document graph to be converted
    output_file : str or file-like object
        A string is treated as the output file path; its parent directory
        is created if necessary. Any other object needs a ``write()``
        method, to which the serialized document is passed.

    Raises
    ------
    TypeError
        if ``output_file`` is neither a string nor a writable object
    """
    exmaralda_file = ExmaraldaFile(docgraph)

    if isinstance(output_file, str):
        # Resolve the path first: the directory part of a bare file name is
        # the empty string, which can't be checked or created.
        target_dir = os.path.dirname(os.path.abspath(output_file))
        if not os.path.isdir(target_dir):
            create_dir(target_dir)
        exmaralda_file.write(output_file)
        return

    if not hasattr(output_file, 'write'):
        # raise explicitly; an ``assert`` would vanish under -O
        raise TypeError(
            "output_file must be a file path or a writable file object, "
            "not {0!r}".format(type(output_file)))

    # output_file is a file object
    output_file.write(exmaralda_file.__str__())
Пример #5
0
def write_brat(pocores, output_dir):
    """
    Write the brat export of a processed document into ``output_dir``.

    Four UTF-8 encoded files are produced: ``<document name>.txt``,
    ``annotation.conf``, ``visual.conf`` and ``<document name>.ann``.

    Parameters
    ----------
    pocores : Pocores
        the instance whose ``document`` is exported
    output_dir : str
        directory that receives the files; ``create_dir`` is called on it
        before anything is written
    """
    create_dir(output_dir)
    doc_name = os.path.basename(pocores.document.name)

    # (file name, content producer) pairs in the order they are written.
    # The producers are only called once the corresponding file is open.
    outputs = (
        (doc_name + '.txt', lambda: dg.get_text(pocores.document)),
        ('annotation.conf', lambda: create_annotation_conf(pocores)),
        ('visual.conf', lambda: create_visual_conf(pocores)),
        (doc_name + '.ann', lambda: brat_output(pocores)),
    )

    for filename, make_content in outputs:
        target_path = os.path.join(output_dir, filename)
        with codecs.open(target_path, 'wb', encoding='utf-8') as outfile:
            outfile.write(make_content())
Пример #6
0
def write_conll(docgraph, output_file, coreference_layer=None,
                markable_layer=None):
    """
    Convert a DiscourseDocumentGraph into a tab-separated CoNLL 2009 file and
    write it to the given file (or file path).

    Parameters
    ----------
    docgraph : DiscourseDocumentGraph
        the document graph to be converted
    output_file : str or file-like object
        If it is a string, it is used as the path of the output file (a
        missing parent directory is created first). Otherwise it must offer
        a ``write()`` method, which receives the serialized document.
    coreference_layer : str or None
        passed on to ``Conll2009File`` unchanged
    markable_layer : str or None
        passed on to ``Conll2009File``; defaults to the namespace of the
        document graph followed by ``':markable'``

    Raises
    ------
    TypeError
        if ``output_file`` is neither a string nor a writable object
    """
    if markable_layer is None:
        markable_layer = docgraph.ns+':markable'
    conll_file = Conll2009File(docgraph,
                               coreference_layer=coreference_layer,
                               markable_layer=markable_layer)
    if isinstance(output_file, str):
        # abspath() is needed for files in the current directory, where
        # dirname() alone would return an empty string.
        path_to_file = os.path.dirname(os.path.abspath(output_file))
        if not os.path.isdir(path_to_file):
            create_dir(path_to_file)
        conll_file.write(output_file)
    elif hasattr(output_file, 'write'):  # output_file is a file object
        output_file.write(conll_file.__str__())
    else:
        # explicit check instead of ``assert``, which is dropped under -O
        raise TypeError(
            "output_file must be a file path or a writable file object, "
            "not {0!r}".format(type(output_file)))
Пример #7
0
def write_paula(docgraph, output_root_dir, human_readable=False):
    """
    Export a DiscourseDocumentGraph as a set of PAULA XML files that
    represent the same document.

    The files are written to ``<output_root_dir>/<document name>/``, one
    ``<paula id>.xml`` file per entry in the generated PAULA document.

    Parameters
    ----------
    docgraph : DiscourseDocumentGraph
        the document graph to be converted
    output_root_dir : str
        directory in which the document's own subdirectory is created
    human_readable : bool
        passed on to ``PaulaDocument`` unchanged
    """
    paula_document = PaulaDocument(docgraph, human_readable=human_readable)

    # a stream (e.g. STDOUT) can't hold a multi-file document
    error_msg = ("Please specify an output directory.\n"
                 "Paula documents consist of multiple files, so we can't "
                 "just pipe them to STDOUT.")
    assert isinstance(output_root_dir, str), error_msg

    document_dir = os.path.join(output_root_dir, paula_document.name)
    if not os.path.isdir(document_dir):
        create_dir(document_dir)

    for paula_id in paula_document.files:
        xml_path = os.path.join(document_dir, paula_id+'.xml')
        with open(xml_path, 'w') as outfile:
            etree = paula_document.files[paula_id]
            dtd = paula_document.file2dtd[paula_id]
            outfile.write(paula_etree_to_string(etree, dtd))
Пример #8
0
def merging_cli(debug=False):
    """
    simple commandline interface of the merging module.

    This function is called when you use the ``discoursegraphs`` application
    directly on the command line. It reads its arguments from ``sys.argv``,
    merges all given input files into one document graph and writes that
    graph in the requested output format.

    Parameters
    ----------
    debug : bool
        if True, a short success message is printed to STDOUT at the end

    Raises
    ------
    AssertionError
        if one of the given input files doesn't exist
    ValueError
        if the requested output format is not supported
    """
    parser = argparse.ArgumentParser()

    parser.add_argument('-t', '--tiger-file',
                        help='TigerXML (syntax) file to be merged')
    parser.add_argument('-r', '--rst-file',
                        help='RS3 (rhetorical structure) file to be merged')
    parser.add_argument('-a', '--anaphoricity-file',
                        help='anaphoricity file to be merged')
    parser.add_argument('-c', '--conano-file',
                        help='conano file to be merged')
    parser.add_argument('-m', '--mmax-file',
                        help='MMAX2 file to be merged')
    parser.add_argument(
        '-o', '--output-format', default='dot',
        help=('output format: brackets, brat, dot, pickle, geoff, gexf, graphml, '
              'neo4j, exmaralda, conll, paula, no-output'))
    parser.add_argument('output_file', nargs='?', default=sys.stdout)

    args = parser.parse_args(sys.argv[1:])

    # every input file is checked, including the MMAX2 file
    for filepath in (args.tiger_file, args.rst_file, args.anaphoricity_file,
                     args.conano_file, args.mmax_file):
        if filepath:  # if it was specified on the command line
            assert os.path.isfile(filepath), \
                "File '{}' doesn't exist".format(filepath)

    # create an empty document graph. merge it with other graphs later on.
    discourse_docgraph = DiscourseDocumentGraph()

    if args.tiger_file:
        from discoursegraphs.readwrite.tiger import TigerDocumentGraph
        tiger_docgraph = TigerDocumentGraph(args.tiger_file)
        discourse_docgraph.merge_graphs(tiger_docgraph)

    if args.rst_file:
        rst_graph = dg.read_rs3(args.rst_file)
        discourse_docgraph.merge_graphs(rst_graph)

    if args.anaphoricity_file:
        from discoursegraphs.readwrite import AnaphoraDocumentGraph
        anaphora_graph = AnaphoraDocumentGraph(args.anaphoricity_file)
        discourse_docgraph.merge_graphs(anaphora_graph)
        # the anaphora doc graph only contains trivial edges from its root
        # node.
        try:
            discourse_docgraph.remove_node('anaphoricity:root_node')
        except networkx.NetworkXError:  # ignore if the node doesn't exist
            pass

    if args.conano_file:
        from discoursegraphs.readwrite import ConanoDocumentGraph
        conano_graph = ConanoDocumentGraph(args.conano_file)
        discourse_docgraph.merge_graphs(conano_graph)

    if args.mmax_file:
        from discoursegraphs.readwrite import MMAXDocumentGraph
        mmax_graph = MMAXDocumentGraph(args.mmax_file)
        discourse_docgraph.merge_graphs(mmax_graph)

    if isinstance(args.output_file, str):  # if we're not piping to stdout ...
        # we need abspath to handle files in the current directory
        path_to_output_file = \
            os.path.dirname(os.path.abspath(args.output_file))
        if not os.path.isdir(path_to_output_file):
            create_dir(path_to_output_file)

    if args.output_format == 'dot':
        write_dot(discourse_docgraph, args.output_file)
    elif args.output_format == 'brat':
        dg.write_brat(discourse_docgraph, args.output_file)
    elif args.output_format == 'brackets':
        dg.write_brackets(discourse_docgraph, args.output_file)
    elif args.output_format == 'pickle':
        import cPickle as pickle
        if isinstance(args.output_file, str):
            with open(args.output_file, 'wb') as pickle_file:
                pickle.dump(discourse_docgraph, pickle_file)
        else:
            # no output path was given: output_file is the STDOUT stream,
            # which can't be passed to open()
            pickle.dump(discourse_docgraph, args.output_file)
    elif args.output_format in ('geoff', 'neo4j'):
        from discoursegraphs.readwrite.neo4j import write_geoff
        write_geoff(discourse_docgraph, args.output_file)
        sys.stdout.write('\n')  # this is just cosmetic for stdout
    elif args.output_format == 'gexf':
        dg.write_gexf(discourse_docgraph, args.output_file)
    elif args.output_format == 'graphml':
        dg.write_graphml(discourse_docgraph, args.output_file)
    elif args.output_format == 'exmaralda':
        from discoursegraphs.readwrite.exmaralda import write_exb
        write_exb(discourse_docgraph, args.output_file)
    elif args.output_format == 'conll':
        from discoursegraphs.readwrite.conll import write_conll
        write_conll(discourse_docgraph, args.output_file)
    elif args.output_format == 'paula':
        from discoursegraphs.readwrite.paulaxml.paula import write_paula
        write_paula(discourse_docgraph, args.output_file)

    elif args.output_format == 'no-output':
        pass  # just testing if the merging works
    else:
        raise ValueError(
            "Unsupported output format: {}".format(args.output_format))

    if debug:
        # same text as the former print statement (which put an additional
        # space between its two arguments)
        sys.stdout.write(
            "Merged successfully:  {}\n".format(args.tiger_file))
Пример #9
0
def merging_cli(debug=False):
    """
    simple commandline interface of the merging module.

    This function is called when you use the ``discoursegraphs`` application
    directly on the command line. The arguments are taken from ``sys.argv``;
    all given input files are merged into a single document graph, which is
    then written in the requested output format.

    Parameters
    ----------
    debug : bool
        if True, a short success message is printed to STDOUT at the end

    Raises
    ------
    AssertionError
        if one of the given input files doesn't exist
    ValueError
        if the requested output format is not supported
    """
    parser = argparse.ArgumentParser()

    parser.add_argument('-t',
                        '--tiger-file',
                        help='TigerXML (syntax) file to be merged')
    parser.add_argument('-r',
                        '--rst-file',
                        help='RS3 (rhetorical structure) file to be merged')
    parser.add_argument('-a',
                        '--anaphoricity-file',
                        help='anaphoricity file to be merged')
    parser.add_argument('-c', '--conano-file', help='conano file to be merged')
    parser.add_argument('-m', '--mmax-file', help='MMAX2 file to be merged')
    parser.add_argument(
        '-o',
        '--output-format',
        default='dot',
        help=(
            'output format: brackets, brat, dot, pickle, geoff, gexf, graphml, '
            'neo4j, exmaralda, conll, paula, no-output'))
    parser.add_argument('output_file', nargs='?', default=sys.stdout)

    args = parser.parse_args(sys.argv[1:])

    # all input files are validated, the MMAX2 file included
    input_files = (args.tiger_file, args.rst_file, args.anaphoricity_file,
                   args.conano_file, args.mmax_file)
    for filepath in input_files:
        if filepath:  # if it was specified on the command line
            assert os.path.isfile(filepath), \
                "File '{}' doesn't exist".format(filepath)

    # create an empty document graph. merge it with other graphs later on.
    discourse_docgraph = DiscourseDocumentGraph()

    if args.tiger_file:
        from discoursegraphs.readwrite.tiger import TigerDocumentGraph
        tiger_docgraph = TigerDocumentGraph(args.tiger_file)
        discourse_docgraph.merge_graphs(tiger_docgraph)

    if args.rst_file:
        rst_graph = dg.read_rs3(args.rst_file)
        discourse_docgraph.merge_graphs(rst_graph)

    if args.anaphoricity_file:
        from discoursegraphs.readwrite import AnaphoraDocumentGraph
        anaphora_graph = AnaphoraDocumentGraph(args.anaphoricity_file)
        discourse_docgraph.merge_graphs(anaphora_graph)
        # the anaphora doc graph only contains trivial edges from its root
        # node.
        try:
            discourse_docgraph.remove_node('anaphoricity:root_node')
        except networkx.NetworkXError:  # ignore if the node doesn't exist
            pass

    if args.conano_file:
        from discoursegraphs.readwrite import ConanoDocumentGraph
        conano_graph = ConanoDocumentGraph(args.conano_file)
        discourse_docgraph.merge_graphs(conano_graph)

    if args.mmax_file:
        from discoursegraphs.readwrite import MMAXDocumentGraph
        mmax_graph = MMAXDocumentGraph(args.mmax_file)
        discourse_docgraph.merge_graphs(mmax_graph)

    # False if no output path was given, i.e. we're piping to stdout
    writes_to_path = isinstance(args.output_file, str)

    if writes_to_path:
        # we need abspath to handle files in the current directory
        path_to_output_file = \
            os.path.dirname(os.path.abspath(args.output_file))
        if not os.path.isdir(path_to_output_file):
            create_dir(path_to_output_file)

    if args.output_format == 'dot':
        write_dot(discourse_docgraph, args.output_file)
    elif args.output_format == 'brat':
        dg.write_brat(discourse_docgraph, args.output_file)
    elif args.output_format == 'brackets':
        dg.write_brackets(discourse_docgraph, args.output_file)
    elif args.output_format == 'pickle':
        import cPickle as pickle
        if writes_to_path:
            with open(args.output_file, 'wb') as pickle_file:
                pickle.dump(discourse_docgraph, pickle_file)
        else:
            # the STDOUT stream can't be passed to open(); dump to it
            # directly
            pickle.dump(discourse_docgraph, args.output_file)
    elif args.output_format in ('geoff', 'neo4j'):
        from discoursegraphs.readwrite.neo4j import write_geoff
        write_geoff(discourse_docgraph, args.output_file)
        sys.stdout.write('\n')  # this is just cosmetic for stdout
    elif args.output_format == 'gexf':
        dg.write_gexf(discourse_docgraph, args.output_file)
    elif args.output_format == 'graphml':
        dg.write_graphml(discourse_docgraph, args.output_file)
    elif args.output_format == 'exmaralda':
        from discoursegraphs.readwrite.exmaralda import write_exb
        write_exb(discourse_docgraph, args.output_file)
    elif args.output_format == 'conll':
        from discoursegraphs.readwrite.conll import write_conll
        write_conll(discourse_docgraph, args.output_file)
    elif args.output_format == 'paula':
        from discoursegraphs.readwrite.paulaxml.paula import write_paula
        write_paula(discourse_docgraph, args.output_file)

    elif args.output_format == 'no-output':
        pass  # just testing if the merging works
    else:
        raise ValueError("Unsupported output format: {}".format(
            args.output_format))

    if debug:
        # same text as the former print statement (which put an additional
        # space between its two arguments)
        sys.stdout.write(
            "Merged successfully:  {}\n".format(args.tiger_file))
Пример #10
0
def run_pocores(input_file,
                input_format,
                output_dest=None,
                output_format='bracketed',
                weights=WEIGHTS,
                max_sent_dist=MAX_SENT_DIST,
                debug=False,
                eval_file=None):
    """
    run the pocores coreference system on a mate-parsed, CoNLL-formatted
    input file.

    Parameters
    ----------
    input_file : str
        CoNLL file to be read with ``dg.read_conll``
    input_format : str
        CoNLL flavour of the input file: '2009' or '2010'
    output_dest : file or str
        For 'bracketed' and 'xml' output: an open file object or the path of
        the output file. For 'brat' output: the path of the output directory.
        NOTE(review): the default ``None`` is not handled by any output
        branch -- callers have to pass a destination.
    output_format : str
        'bracketed', 'brat' or 'xml'
    weights
        passed on to ``Pocores.resolve_anaphora``
    max_sent_dist
        passed on to ``Pocores.resolve_anaphora``
    debug : bool
        passed on to ``Pocores.resolve_anaphora``; if True, a coreference
        report is printed after the output was written
    eval_file
        evaluation is not implemented, yet; any true value raises
        ``NotImplementedError``

    Returns
    -------
    pocores : Pocores
        the instance that was used to resolve the anaphora
    """
    assert input_format in ('2009', '2010')
    assert output_format in ('bracketed', 'brat', 'xml')

    # both supported CoNLL flavours are read with the same settings, except
    # for the attribute the lemma is taken from
    lemma_attr = 'plemma' if input_format == '2009' else 'lemma'
    docgraph = dg.read_conll(input_file,
                             conll_format=input_format,
                             deprel_attr='pdeprel',
                             feat_attr='pfeat',
                             head_attr='phead',
                             lemma_attr=lemma_attr,
                             pos_attr='ppos')

    pocores = Pocores(docgraph)
    pocores.resolve_anaphora(weights, max_sent_dist, debug=debug)
    pocores.add_coreference_chains_to_docgraph()

    if output_format in ('bracketed', 'xml'):
        # these two formats only differ in the function that serializes
        # the result
        if output_format == 'bracketed':
            render = output_with_brackets
        else:
            render = make_xml

        if isinstance(output_dest, file):
            output_dest.write(render(pocores))
        else:
            # abspath() is needed for files in the current directory, where
            # the directory part of the path would be an empty string
            path_to_dir = os.path.dirname(os.path.abspath(output_dest))
            create_dir(path_to_dir)
            with codecs.open(output_dest, 'w', 'utf-8') as output_file:
                output_file.write(render(pocores))
    else:  # 'brat'
        if not isinstance(output_dest, file):
            # output_dest will be treated as a directory
            write_brat(pocores, output_dest)
        else:
            sys.stderr.write('For brat output specify an output folder.\n')
            sys.exit(1)

    if debug:
        print_coreference_report(pocores)

    if eval_file:
        # TODO: implement proper scorer.pl-based evaluation
        # there's some useful code in the /var/local/git/Depot/coreference.git
        # repo on hebe
        raise NotImplementedError

    return pocores