示例#1
0
def main():
    """Print the average of every per-hypothesis statistic for a hypotheses file.

    Command-line arguments:
        graph_path: path to the graph JSON file.
        hypothesis_path: path to the JSON file with hypotheses.

    Each hypothesis is passed through ``hyp_stats``, which accumulates values
    into a mapping from statistic name to a list of numbers. The mean of every
    list, rounded to two decimals, is printed to stdout.
    """
    arg_parser = ArgumentParser()
    # both arguments are required positionals
    arg_parser.add_argument('graph_path', help='path to the graph JSON file')
    arg_parser.add_argument('hypothesis_path',
                            help='path to the JSON file with hypotheses')
    cli_args = arg_parser.parse_args()

    # load the knowledge graph
    graph_dict = util.read_json_file(cli_args.graph_path, 'JSON graph')
    json_graph = JsonGraph.from_dict(graph_dict)

    # load the hypotheses that refer to that graph
    raw_hypotheses = util.read_json_file(cli_args.hypothesis_path, 'hypotheses')
    collection = AidaHypothesisCollection.from_json(raw_hypotheses, json_graph)

    # hyp_stats returns the (updated) accumulator, so keep rebinding it
    stats = defaultdict(list)
    for hypothesis in collection:
        stats = hyp_stats(hypothesis, stats, json_graph)

    print("================ Overall =============")
    for stat_name, values in stats.items():
        mean_value = sum(values) / len(values)
        print(stat_name, round(mean_value, 2))
示例#2
0
def main():
    """Extract one subgraph per hypothesis seed and dump each to a JSON file.

    Command-line arguments:
        graph_file: path to the graph json file.
        seed_file: path to the hypothesis seed file.
        output_dir: directory that receives ``subgraph_<idx>.json`` files.
        --max_num_hops / --min_num_eres / --min_num_stmts: optional integer
            stopping criteria forwarded to ``extract_subgraph``.
        -v / --verbose: forwarded to ``extract_subgraph``.
        -f / --force: overwrite existing output without warning.

    Raises:
        RuntimeError: if all three stopping criteria are missing (or falsy).
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('graph_file', help='path to the graph json file')
    arg_parser.add_argument('seed_file',
                            help='path to the hypothesis seed file')
    arg_parser.add_argument('output_dir',
                            help='path to the output directory')

    # the three stopping criteria share type and default, so declare them
    # from a table (order is kept so the --help output stays the same)
    stopping_criteria = (
        ('--max_num_hops', 'maximum number of hops to extend from'),
        ('--min_num_eres',
         'minimum number of EREs in the subgraph to stop extending'),
        ('--min_num_stmts',
         'minimum number of statements in the subgraph to stop extending'),
    )
    for flag, help_text in stopping_criteria:
        arg_parser.add_argument(flag, type=int, default=None, help=help_text)

    arg_parser.add_argument(
        '--verbose', '-v', action='store_true',
        help='print more details in each hop of extraction')
    arg_parser.add_argument(
        '--force', '-f', action='store_true', default=False,
        help='If specified, overwrite existing output files without warning')

    cli_args = arg_parser.parse_args()

    warn_on_overwrite = not cli_args.force
    output_dir = util.get_output_dir(cli_args.output_dir,
                                     overwrite_warning=warn_on_overwrite)

    graph_json = util.read_json_file(cli_args.graph_file, 'JSON graph')
    seed_json = util.read_json_file(cli_args.seed_file, 'hypothesis seeds')

    hop_limit = cli_args.max_num_hops
    ere_floor = cli_args.min_num_eres
    stmt_floor = cli_args.min_num_stmts

    # at least one criterion is needed, otherwise extraction has no stop rule
    if not hop_limit and not ere_floor and not stmt_floor:
        raise RuntimeError(
            'Must specify at least one of "max_num_hops", "min_num_eres", and '
            '"min_num_stmts"')

    # probabilities are zipped in only to pair them with the supports; the
    # value itself is not used
    seed_pairs = zip(seed_json['probs'], seed_json['support'])
    for hypothesis_idx, (_prob, hypothesis) in enumerate(seed_pairs):
        subgraph = extract_subgraph(index=hypothesis_idx,
                                    graph=graph_json,
                                    statements=hypothesis['statements'],
                                    max_num_hops=hop_limit,
                                    min_num_eres=ere_floor,
                                    min_num_stmts=stmt_floor,
                                    verbose=cli_args.verbose)

        target = output_dir / f'subgraph_{hypothesis_idx}.json'
        with open(str(target), 'w') as fout:
            json.dump(subgraph, fout, indent=2)
示例#3
0
def main():
    """Compress a JSON graph via ``build_mappings`` and write graph plus log.

    Command-line arguments:
        input_graph_path: path to the input graph json file.
        output_graph_path: where the coref-compressed graph is written.
        output_log_path: where the mapping log is written.
        -f / --force: overwrite existing output without warning.

    The log contains every mapping whose name does not contain ``'key'``.
    For mappings whose name ends in ``'s'`` each value is converted to a list
    (presumably because those values are sets, which JSON cannot encode).
    """
    arg_parser = ArgumentParser()
    arg_parser.add_argument('input_graph_path',
                            help='path to the input graph json file')
    arg_parser.add_argument('output_graph_path',
                            help='path to write the coref-compressed graph')
    arg_parser.add_argument('output_log_path',
                            help='path to write the log file')
    arg_parser.add_argument(
        '-f', '--force', action='store_true', default=False,
        help='If specified, overwrite existing output files without warning')
    cli_args = arg_parser.parse_args()

    warn_on_overwrite = not cli_args.force
    output_graph_path = util.get_output_path(
        cli_args.output_graph_path, overwrite_warning=warn_on_overwrite)
    output_log_path = util.get_output_path(
        cli_args.output_log_path, overwrite_warning=warn_on_overwrite)

    source_graph = JsonGraph.from_dict(
        util.read_json_file(cli_args.input_graph_path, 'JSON graph'))

    # sanity check: the ERE iterator and the ERE list must agree in size
    ere_total = sum(1 for _ in source_graph.each_ere())
    assert ere_total == len(source_graph.eres)
    stmt_total = sum(1 for _ in source_graph.each_statement())
    logging.info(
        'Found {} EREs and {} statements in the original graph'.format(
            ere_total, stmt_total))

    mappings = build_mappings(source_graph)

    # the compress_* helpers fill this initially empty graph
    compressed_graph = JsonGraph()
    new_ere_total = compress_eres(source_graph, mappings, compressed_graph)
    new_stmt_total = compress_statements(source_graph, mappings,
                                         compressed_graph)
    logging.info(
        'Finished coref-compressed graph with {} EREs and {} statements'.
        format(new_ere_total, new_stmt_total))

    logging.info(
        'Writing compressed json graph to {}'.format(output_graph_path))
    with open(str(output_graph_path), 'w') as graph_out:
        json.dump(compressed_graph.as_dict(), graph_out, indent=1)

    log_json = {}
    for name, mapping in mappings.items():
        if 'key' in name:
            # mappings with 'key' in their name are left out of the log
            continue
        if name.endswith('s'):
            log_json[name] = {
                source: list(targets) for source, targets in mapping.items()
            }
        else:
            log_json[name] = mapping

    logging.info('Writing compression log to {}'.format(output_log_path))
    with open(str(output_log_path), 'w') as log_out:
        json.dump(log_json, log_out, indent=2)
def main():
    """Filter every hypotheses file and write the result under the same name.

    Command-line arguments:
        graph_path: path to the input graph JSON file.
        hypotheses_path: a raw hypotheses file, or a directory of ``.json``
            files.
        output_dir: directory receiving one filtered file per input file.
        -f / --force: overwrite existing output without warning.

    For each input file the hypotheses are expanded, those rejected by
    ``hypothesis_too_short`` are dropped, the rest are passed through an
    ``AidaHypothesisFilter`` and then ``compactify``. The ``graph`` and
    ``queries`` entries of the input JSON are carried over when present.
    """
    arg_parser = ArgumentParser()
    arg_parser.add_argument('graph_path',
                            help='Path to the input graph JSON file')
    arg_parser.add_argument(
        'hypotheses_path',
        help='Path to the raw hypotheses file, or a directory with multiple files')
    arg_parser.add_argument(
        'output_dir',
        help='Directory to write the filtered hypothesis files(s)')
    arg_parser.add_argument(
        '-f', '--force', action='store_true', default=False,
        help='If specified, overwrite existing output files without warning')
    cli_args = arg_parser.parse_args()

    output_dir = util.get_output_dir(cli_args.output_dir,
                                     overwrite_warning=not cli_args.force)

    graph_dict = util.read_json_file(cli_args.graph_path, 'JSON graph')
    json_graph = JsonGraph.from_dict(graph_dict)
    input_paths = util.get_file_list(cli_args.hypotheses_path,
                                     suffix='.json', sort=True)

    for input_path in input_paths:
        hypotheses_json = util.read_json_file(input_path, 'hypotheses')
        collection = AidaHypothesisCollection.from_json(hypotheses_json,
                                                        json_graph)
        collection.expand()

        # a fresh filter is created for every input file
        hypothesis_filter = AidaHypothesisFilter(json_graph)

        kept = []
        for hypothesis in collection:
            if hypothesis_too_short(hypothesis, json_graph):
                continue
            kept.append(hypothesis_filter.filtered(hypothesis))

        compacted = compactify(kept, json_graph)
        filtered_json = AidaHypothesisCollection(compacted).to_json()

        # carry over graph filename and queries, if they were there before
        for carried_key in ('graph', 'queries'):
            if carried_key in hypotheses_json:
                filtered_json[carried_key] = hypotheses_json[carried_key]

        output_path = output_dir / input_path.name
        logging.info(
            'Writing filtered hypotheses to {} ...'.format(output_path))
        with open(str(output_path), 'w') as fout:
            json.dump(filtered_json, fout, indent=1)
示例#5
0
def main():
    """Write each hypothesis as a human-readable text file.

    Command-line arguments:
        graph_path: path to the graph JSON file.
        hypothesis_path: path to the JSON file with hypotheses.
        roles_ontology_path: path to the roles ontology file.
        output_dir: directory receiving ``hypothesis-NNN.txt`` files, one per
            hypothesis, numbered from 000 in collection order.
    """
    arg_parser = ArgumentParser()
    positional_args = (
        ('graph_path', 'path to the graph JSON file'),
        ('hypothesis_path', 'path to the JSON file with hypotheses'),
        ('roles_ontology_path', 'path to the roles ontology file'),
        ('output_dir', 'directory to write human-readable hypotheses'),
    )
    for arg_name, help_text in positional_args:
        arg_parser.add_argument(arg_name, help=help_text)
    cli_args = arg_parser.parse_args()

    graph_dict = util.read_json_file(cli_args.graph_path, 'JSON graph')
    json_graph = JsonGraph.from_dict(graph_dict)

    raw_hypotheses = util.read_json_file(cli_args.hypothesis_path,
                                         'hypotheses')
    collection = AidaHypothesisCollection.from_json(raw_hypotheses,
                                                    json_graph)

    roles_ontology = util.read_json_file(cli_args.roles_ontology_path,
                                         'roles ontology')

    # the overwrite warning is always enabled; this script has no --force
    output_dir = util.get_output_dir(cli_args.output_dir,
                                     overwrite_warning=True)

    for position, hypothesis in enumerate(collection.hypotheses):
        file_name = 'hypothesis-{:0>3d}.txt'.format(position)
        with open(str(output_dir / file_name), "w", encoding="utf-8") as fout:
            rendered = hypothesis.to_str(roles_ontology)
            print(rendered, file=fout)
示例#6
0
def main():
    """Re-rank raw hypothesis seeds and write them out as hypothesis JSON.

    Command-line arguments:
        graph_path: path to the input graph JSON file.
        raw_seeds_path: a raw seeds file, or a directory of ``.json`` files.
        output_dir: directory receiving ``<prefix>_seeds.json`` files, where
            ``<prefix>`` is the input file name up to its first underscore.
        --plausibility_model_path, --indexer_path: when BOTH are given, seeds
            are first re-ranked with ``rerank_seeds_by_plausibility``.
        -n / --max_num_seeds: forwarded to ``select_seeds_by_novelty``.
        -f / --force: overwrite existing output without warning.

    Every entry of a raw seeds file other than ``'graph'`` is treated as a
    facet holding a list of seed JSON objects. The ``'graph'`` entry is copied
    into the output unchanged.
    """
    arg_parser = ArgumentParser()
    arg_parser.add_argument('graph_path',
                            help='Path to the input graph JSON file')
    arg_parser.add_argument(
        'raw_seeds_path',
        help='Path to the raw hypothesis seeds file, or a directory with '
        'multiple seeds files')
    arg_parser.add_argument(
        'output_dir', help='Directory to write the reranked hypothesis seeds')
    arg_parser.add_argument('--plausibility_model_path',
                            help='Path to a hypothesis plausibility model')
    arg_parser.add_argument('--indexer_path',
                            help="Path to the indexers file")
    arg_parser.add_argument('-n', '--max_num_seeds', type=int, default=None,
                            help='Only output up to n hypothesis seeds')
    arg_parser.add_argument(
        '-f', '--force', action='store_true', default=False,
        help='If specified, overwrite existing output files without warning')
    cli_args = arg_parser.parse_args()

    graph_dict = util.read_json_file(cli_args.graph_path, 'JSON graph')
    json_graph = JsonGraph.from_dict(graph_dict)

    input_paths = util.get_file_list(cli_args.raw_seeds_path,
                                     suffix='.json', sort=True)

    output_dir = util.get_output_dir(cli_args.output_dir,
                                     overwrite_warning=not cli_args.force)

    model_path = cli_args.plausibility_model_path
    indexer_path = cli_args.indexer_path
    use_plausibility = not (model_path is None or indexer_path is None)

    for input_path in input_paths:
        raw_seeds_json = util.read_json_file(input_path, 'seeds by facet')

        # every key except 'graph' names a facet with a list of seeds
        seeds_by_facet = {
            facet_label: [
                HypothesisSeed.from_json(seed_json, json_graph)
                for seed_json in facet_seeds
            ]
            for facet_label, facet_seeds in raw_seeds_json.items()
            if facet_label != 'graph'
        }

        if use_plausibility:
            seeds_by_facet = rerank_seeds_by_plausibility(
                seeds_by_facet, cli_args.graph_path, model_path, indexer_path)

        ranked_seeds = select_seeds_by_novelty(seeds_by_facet,
                                               cli_args.max_num_seeds)

        # Turn ranks into log weights. The numbers carry no meaning beyond
        # their order: log(1/1), log(1/2), log(1/3), ...
        finalized = []
        for position, seed in enumerate(ranked_seeds, start=1):
            seed.hypothesis.update_weight(math.log(1.0 / position))
            finalized.append(seed.finalize())

        output_json = AidaHypothesisCollection(finalized).to_json()
        output_json['graph'] = raw_seeds_json['graph']

        file_prefix = input_path.name.split('_')[0]
        output_path = output_dir / (file_prefix + '_seeds.json')
        logging.info(
            'Writing re-ranked hypothesis seeds to {} ...'.format(output_path))
        with open(str(output_path), 'w') as fout:
            json.dump(output_json, fout, indent=1)
def main():
    """Write one SPARQL update per top hypothesis that inserts aida:handle.

    Command-line arguments:
        graph_path: path to the graph json file.
        hypotheses_path: path to the hypotheses json file.
        output_dir: directory receiving ``hypothesis-NNN-update.rq`` files
            (numbered from 001 in descending order of probability).
        --top: number of top hypotheses to output (default 50).
        -f / --force: overwrite existing output without warning.

    Handles come from ``compute_handle_mapping``; double quotes are stripped
    from both ends of each handle before it is written as a quoted literal.
    """
    arg_parser = ArgumentParser()
    arg_parser.add_argument('graph_path', help='path to the graph json file')
    arg_parser.add_argument('hypotheses_path',
                            help='path to the hypotheses json file')
    arg_parser.add_argument('output_dir', help='Directory to write queries')
    arg_parser.add_argument('--top', default=50, type=int,
                            help='number of top hypothesis to output')
    arg_parser.add_argument(
        '-f', '--force', action='store_true', default=False,
        help='If specified, overwrite existing output files without warning')
    cli_args = arg_parser.parse_args()

    graph_dict = util.read_json_file(cli_args.graph_path, 'JSON graph')
    json_graph = JsonGraph.from_dict(graph_dict)
    mappings = json_graph.build_cluster_member_mappings()

    hypotheses_json = util.read_json_file(cli_args.hypotheses_path,
                                          'hypotheses')

    output_dir = util.get_output_dir(cli_args.output_dir,
                                     overwrite_warning=not cli_args.force)

    # visit hypotheses from most to least probable; only the index is needed
    by_probability = sorted(enumerate(hypotheses_json['probs']),
                            key=itemgetter(1),
                            reverse=True)

    for top_count, (result_idx, _prob) in enumerate(by_probability, start=1):
        hypothesis = hypotheses_json['support'][result_idx]

        query_parts = [update_prefix + 'INSERT DATA\n{\n']

        prototype_handles = compute_handle_mapping(
            json_graph,
            hypothesis,
            member_to_clusters=mappings['member_to_clusters'],
            cluster_to_prototype=mappings['cluster_to_prototype'])

        for prototype, handle in prototype_handles.items():
            # drop surrounding double quotes; the template adds its own pair
            bare_handle = handle.strip('"')
            query_parts.append('  <{}> aida:handle "{}" .\n'.format(
                prototype, bare_handle))

        query_parts.append('}')

        file_name = 'hypothesis-{:0>3d}-update.rq'.format(top_count)
        with open(str(output_dir / file_name), 'w') as fout:
            fout.write(''.join(query_parts))

        # checked after writing, so at least one file is always produced
        if top_count >= cli_args.top:
            break
示例#8
0
import sys
from pathlib import Path

from aida_utexas import util

seeds_dir = Path(sys.argv[1]).resolve()
assert seeds_dir.exists() and seeds_dir.is_dir(), \
    '{} does not exist!'.format(seeds_dir)

seeds_file_list = sorted(
    [f for f in seeds_dir.iterdir() if f.suffix == '.json'])

for seeds_file in seeds_file_list:
    seeds_json = util.read_json_file(seeds_file, 'seeds')
    num_failed_queries_list = []
    for cluster_idx, cluster in enumerate(seeds_json['support']):
        num_failed_queries = len(cluster['failedQueries'])
        num_failed_queries_list.append(num_failed_queries)

        num_query_stmts = len(cluster['queryStatements'])
        num_stmts = len(cluster['statements'])
        num_stmts_zero_weight = len([
            stmt_weight for stmt_weight in cluster['statementWeights']
            if stmt_weight == 0
        ])
        num_stmts_negative_weight = len([
            stmt_weight for stmt_weight in cluster['statementWeights']
            if stmt_weight == -100
        ])
        print(
            'Cluster #{}: # query stmts = {}, # stmts = {}, # stmt (0) = {}, '
示例#9
0
def main():
    """Write, and optionally execute, one SPARQL query per top hypothesis.

    Command-line arguments:
        graph_path: path to the graph json file.
        hypotheses_path: path to the hypotheses json file.
        db_dir: directory holding tdb database copies (entries matching
            ``copy*``); the first one in sorted order is queried.
        output_dir: directory receiving ``hypothesis-NNN-query.rq`` and, unless
            ``--dry_run`` is given, ``hypothesis-NNN-raw.ttl``.
        --top: number of top hypotheses to output (default 50).
        --dry_run: only write the query files, do not run ``tdbquery``.
        --query_just, --query_conf: forwarded to ``queries_for_aida_result``.
        -f / --force: overwrite existing output without warning.

    Hypotheses are processed in descending order of probability and numbered
    from 001.

    Raises:
        RuntimeError: if queries are to be executed but ``db_dir`` contains no
            ``copy*`` database.
    """
    parser = ArgumentParser()
    parser.add_argument('graph_path', help='path to the graph json file')
    parser.add_argument('hypotheses_path', help='path to the hypotheses json file')
    parser.add_argument('db_dir', help='directory with copies of tdb databases')
    parser.add_argument('output_dir', help='path to output directory')
    parser.add_argument('--top', default=50, type=int,
                        help='number of top hypothesis to output')
    parser.add_argument('--dry_run', action='store_true',
                        help='if specified, only write the SPARQL queries to '
                             'files, without actually executing the queries')
    parser.add_argument('--query_just', action='store_true')
    parser.add_argument('--query_conf', action='store_true')
    parser.add_argument('-f', '--force', action='store_true', default=False,
                        help='If specified, overwrite existing output files without warning')

    args = parser.parse_args()

    json_graph = JsonGraph.from_dict(util.read_json_file(args.graph_path, 'JSON graph'))

    mappings = json_graph.build_cluster_member_mappings()
    member_to_clusters = mappings['member_to_clusters']
    cluster_to_prototype = mappings['cluster_to_prototype']
    prototype_set = set(mappings['prototype_to_clusters'].keys())

    hypotheses_json = util.read_json_file(args.hypotheses_path, 'hypotheses')

    output_dir = util.get_output_dir(args.output_dir, overwrite_warning=not args.force)

    db_dir = util.get_input_path(args.db_dir)
    db_path_list = [str(path) for path in sorted(db_dir.glob('copy*'))]
    print('Using the following tdb databases to query: {}'.format(db_path_list))

    # Fail early with a clear message instead of an IndexError in the loop.
    if not args.dry_run and not db_path_list:
        raise RuntimeError(
            'No tdb database matching "copy*" found in {}'.format(db_dir))

    num_node_queries = len(db_path_list)

    top_count = 0
    for result_idx, prob in sorted(
            enumerate(hypotheses_json['probs']), key=itemgetter(1), reverse=True):
        hypothesis = hypotheses_json['support'][result_idx]
        sparql_query_str = queries_for_aida_result(
            json_graph=json_graph,
            hypothesis=hypothesis,
            member_to_clusters=member_to_clusters,
            cluster_to_prototype=cluster_to_prototype,
            prototype_set=prototype_set,
            num_node_queries=num_node_queries,
            query_just=args.query_just,
            query_conf=args.query_conf)

        top_count += 1

        print(f'Writing queries for hypothesis #{top_count} with prob {prob}')

        sparql_query_path = output_dir / 'hypothesis-{:0>3d}-query.rq'.format(top_count)
        with open(str(sparql_query_path), 'w') as fout:
            fout.write(sparql_query_str + '\n')

        if not args.dry_run:
            query_result_path = output_dir / 'hypothesis-{:0>3d}-raw.ttl'.format(top_count)

            print('Executing queries ...')
            print('query {}'.format(sparql_query_path))
            # Pass an argument list rather than a shell string, so paths with
            # spaces or shell metacharacters reach tdbquery unchanged. The
            # result file is filled by redirecting tdbquery's stdout.
            try:
                with open(str(query_result_path), 'w') as result_out:
                    subprocess.run(
                        ['tdbquery',
                         '--loc', db_path_list[0],
                         '--query', str(sparql_query_path)],
                        stdout=result_out,
                        check=False)
            except OSError as err:
                # Report and move on, like the former shell invocation did
                # when tdbquery could not be started.
                print('Failed to run tdbquery: {}'.format(err))

        if top_count >= args.top:
            break
示例#10
0
def main():
    """Build raw hypothesis seeds for every query file and save them as JSON.

    Command-line arguments:
        graph_path: path to the input graph JSON file.
        query_path: a query file, or a directory of ``.json`` query files.
        output_dir: directory receiving ``<prefix>_seeds.json`` files, where
            ``<prefix>`` is the query file name up to its first underscore.
        -n / --max_num_seeds_per_facet, -d / --discard_failed_core_constraints,
        -r / --rank_cutoff (default 100), --frame_grouping: forwarded to
            ``make_cluster_seeds``.
        -f / --force: overwrite existing output without warning.
    """
    arg_parser = ArgumentParser()
    arg_parser.add_argument('graph_path',
                            help='Path to the input graph JSON file')
    arg_parser.add_argument(
        'query_path',
        help='Path to the input query file, or a directory with multiple queries')
    arg_parser.add_argument(
        'output_dir', help='Directory to write the raw hypothesis seeds')
    arg_parser.add_argument(
        '-n', '--max_num_seeds_per_facet', type=int, default=None,
        help='If provided, only save up to <arg> seeds per facet')
    arg_parser.add_argument(
        '-d', '--discard_failed_core_constraints', action='store_true',
        help='If specified, discard hypotheses with failed core constraints. '
        'Try not to use this one during evaluation at first, so that we '
        'do not discard hypotheses we might still need. If we have too many '
        'hypotheses and the script runs too slowly, then use this.')
    arg_parser.add_argument(
        '-r', '--rank_cutoff', type=int, default=100,
        help='If specified, discard hypotheses early if there are at least <arg> '
        'other hypotheses that have the same fillers for a certain number '
        '(default = 3) of their non-entrypoint query variables. We might '
        'need this in the evaluation if some facets have many variables '
        'that lead to combinatorial explosion.')
    arg_parser.add_argument(
        '--frame_grouping', action='store_true',
        help='If specified, group query constraints by frames instead of by facets')
    arg_parser.add_argument(
        '-f', '--force', action='store_true',
        help='If specified, overwrite existing output files without warning')
    cli_args = arg_parser.parse_args()

    graph_dict = util.read_json_file(cli_args.graph_path, 'JSON graph')
    json_graph = JsonGraph.from_dict(graph_dict)

    query_paths = util.get_file_list(cli_args.query_path,
                                     suffix='.json', sort=True)

    output_dir = util.get_output_dir(cli_args.output_dir,
                                     overwrite_warning=not cli_args.force)

    # options that are identical for every query file
    seed_options = {
        'max_num_seeds_per_facet': cli_args.max_num_seeds_per_facet,
        'frame_grouping': cli_args.frame_grouping,
        'discard_failed_core_constraints':
            cli_args.discard_failed_core_constraints,
        'rank_cutoff': cli_args.rank_cutoff,
    }

    for query_path in query_paths:
        query_json = util.read_json_file(query_path, 'query')

        raw_seeds_json = make_cluster_seeds(json_graph=json_graph,
                                            query_json=query_json,
                                            **seed_options)

        # output file is named after the part before the first underscore
        file_prefix = query_path.name.partition('_')[0]
        output_path = output_dir / (file_prefix + '_seeds.json')
        logging.info(
            'Writing raw hypothesis seeds of each facet to {} ...'.format(
                output_path))
        with open(str(output_path), 'w') as fout:
            json.dump(raw_seeds_json, fout, indent=1)
def main():
    """Write SPARQL update files that attach importance values to hypotheses.

    Command-line arguments:
        graph_path: path to the graph json file.
        hypotheses_path: path to the hypotheses json file.
        output_dir: directory receiving
            ``hypothesis-NNN-update-MMMM.rq`` files.
        frame_id: frame ID used as prefix of every hypothesis identifier.
        --top: number of top hypotheses to output (default 50).
        -f / --force: overwrite existing output without warning.

    Hypotheses are visited in descending order of probability and numbered
    from 001. For each one the following update files are written, numbered
    from 0000:
        0000: declares the aida:Hypothesis and its aida:Subgraph and sets the
              importance of the hypothesis and of every node returned by
              ``compute_importance_mapping``;
        0001: links every Entity, Relation and Event to the subgraph;
        0002+: one file per statement, setting that statement's importance.
    """
    arg_parser = ArgumentParser()
    arg_parser.add_argument('graph_path', help='path to the graph json file')
    arg_parser.add_argument('hypotheses_path',
                            help='path to the hypotheses json file')
    arg_parser.add_argument('output_dir', help='Directory to write queries')
    arg_parser.add_argument('frame_id', help='Frame ID of the hypotheses')
    arg_parser.add_argument('--top', default=50, type=int,
                            help='number of top hypothesis to output')
    arg_parser.add_argument(
        '-f', '--force', action='store_true', default=False,
        help='If specified, overwrite existing output files without warning')
    cli_args = arg_parser.parse_args()

    graph_dict = util.read_json_file(cli_args.graph_path, 'JSON graph')
    json_graph = JsonGraph.from_dict(graph_dict)
    mappings = json_graph.build_cluster_member_mappings()

    hypotheses_json = util.read_json_file(cli_args.hypotheses_path,
                                          'hypotheses')

    output_dir = util.get_output_dir(cli_args.output_dir,
                                     overwrite_warning=not cli_args.force)
    frame_id = cli_args.frame_id

    def write_update(hypothesis_rank, query_idx, query_text):
        # one file per update query, named by hypothesis rank and query index
        target = output_dir / 'hypothesis-{:0>3d}-update-{:0>4d}.rq'.format(
            hypothesis_rank, query_idx)
        with open(str(target), 'w') as fout:
            fout.write(query_text)

    # Adds every ERE node to the subgraph. All EREs are included for
    # simplicity, as it's not required that all KEs should be included for
    # NIST to evaluate in M18.
    subgraph_template = (
        'INSERT {{\n'
        '{} aida:subgraphContains ?e .\n'
        '}}\nWHERE\n{{\n'
        '{{ ?e a aida:Entity }}\nUNION\n'
        '{{ ?e a aida:Relation }}\nUNION\n'
        '{{ ?e a aida:Event }}\n}}\n')

    # Sets the importance of a single statement. INSERT {} WHERE {} is needed
    # (rather than INSERT DATA) to allow BNode statements, hence one query
    # per statement.
    statement_template = (
        'INSERT {{ ?x aida:importance "{:.4f}"^^xsd:double . }}\n'
        'WHERE\n{{\n'
        '?x a rdf:Statement .\n'
        '?x rdf:subject <{}> .\n'
        '?x rdf:predicate ldcOnt:{} .\n'
        '?x rdf:object <{}> .\n}}\n')

    by_probability = sorted(enumerate(hypotheses_json['probs']),
                            key=itemgetter(1),
                            reverse=True)

    for top_count, (result_idx, prob) in enumerate(by_probability, start=1):
        # non-positive values are mapped through exp(prob / 2); anything
        # positive gets the fixed weight 0.0001
        hyp_weight = math.exp(prob / 2.0) if prob <= 0.0 else 0.0001

        hypothesis = hypotheses_json['support'][result_idx]

        hypothesis_id = '{}_hypothesis_{:0>3d}'.format(frame_id, top_count)
        hypothesis_name = 'utexas:{}'.format(hypothesis_id)
        subgraph_name = hypothesis_name + '_subgraph'

        # Query 0: the aida:Hypothesis with its importance, its subgraph, and
        # the importance values of the nodes.
        header_parts = [
            update_prefix + 'INSERT DATA\n{\n',
            '  {} a aida:Hypothesis .\n'.format(hypothesis_name),
            '  {} aida:importance "{:.4f}"^^xsd:double .\n'.format(
                hypothesis_name, hyp_weight),
            '  {} aida:hypothesisContent {} .\n'.format(
                hypothesis_name, subgraph_name),
            '  {} a aida:Subgraph .\n'.format(subgraph_name),
        ]

        stmt_importance, node_importance = compute_importance_mapping(
            json_graph,
            hypothesis,
            member_to_clusters=mappings['member_to_clusters'],
            cluster_to_prototype=mappings['cluster_to_prototype'])

        for node_id, node_value in node_importance.items():
            header_parts.append(
                '  <{}> aida:importance "{:.4f}"^^xsd:double .\n'.format(
                    node_id, node_value))
        header_parts.append('}')

        write_update(top_count, 0, ''.join(header_parts))

        # Query 1: aida:subgraphContains for the hypothesis subgraph.
        write_update(top_count, 1,
                     update_prefix + subgraph_template.format(subgraph_name))

        # Queries 2..: one per statement.
        for query_idx, (triple, stmt_value) in enumerate(
                stmt_importance.items(), start=2):
            stmt_subj, stmt_pred, stmt_obj = triple
            write_update(
                top_count, query_idx,
                update_prefix + statement_template.format(
                    stmt_value, stmt_subj, stmt_pred, stmt_obj))

        if top_count >= cli_args.top:
            break
示例#12
0
def main():
    """Run an interactive console for inspecting hypotheses over a graph.

    Command-line arguments:
        graph_path: path to the graph JSON file.
        hypothesis_path: path to the JSON file with hypotheses.
        roles_ontology_path: path to the roles ontology.

    After loading the data, a menu is printed repeatedly and a choice is read
    from stdin until the user enters ``x``. For the choices ``c``, ``e``,
    ``r`` and ``p`` the user is asked for a question ID and, optionally, a
    core role plus ERE ID; the hypotheses are narrowed accordingly for that
    round only. Choice ``R`` narrows the collection for the rest of the run.
    """
    parser = ArgumentParser()
    # required positional
    parser.add_argument('graph_path', help='path to the graph JSON file')
    parser.add_argument('hypothesis_path',
                        help='path to the JSON file with hypotheses')
    parser.add_argument("roles_ontology_path", help="path to roles ontology")

    args = parser.parse_args()

    print("Reading in data...")

    # read KB
    json_graph = JsonGraph.from_dict(
        util.read_json_file(args.graph_path, 'JSON graph'))

    # read hypotheses
    hypotheses_json = util.read_json_file(args.hypothesis_path, 'hypotheses')
    hypothesis_collection = AidaHypothesisCollection.from_json(
        hypotheses_json, json_graph)

    # read roles ontology
    roles_ontology = util.read_json_file(args.roles_ontology_path,
                                         'roles ontology')

    # determine all question IDs
    questionIDs = set()
    for h in hypothesis_collection:
        questionIDs.update(h.questionIDs)

    choice = None
    while choice != "x":
        # Start every round from the full (possibly "R"-restricted)
        # collection. Without this, "sr" chosen first would hit an unbound
        # name, and later rounds would silently reuse a stale filter.
        this_hypothesis_collection = hypothesis_collection

        # determine core choice (IDs are sorted so the listing is stable)
        print("question IDs:", ", ".join(sorted(questionIDs)))
        print("Choose from:")
        print("c: core hypothesis display")
        print("e: show events/relations connected to an ere")
        print("r: show events/relations connected to a role filler")
        print("se: survey context of an ERE independent of hypotheses")
        print("sr: survey context of a role filler independent of hypotheses")
        print("p: print hypotheses for a particular question ID")
        print(
            "R: restrict hypotheses to be considered going forward, for the rest of the run"
        )
        print("x: exit")

        choice = input()

        # determine additional restrictions on hypotheses to consider
        if choice in ["c", "e", "r", "p"]:
            question_id = input("Question ID: ")

            # filter hypotheses by question ID
            this_hypothesis_collection = filter_hypotheses_by_question(
                hypothesis_collection, question_id)

            # additionally filter by a core role filler?
            restrict_core_role = input("Optional core role to restrict: ")
            if restrict_core_role != "":
                restrict_core_ere = input(
                    "Value to restrict the core role to (ERE ID): ")

                this_hypothesis_collection = filter_hypotheses_by_entrypoints(
                    this_hypothesis_collection, json_graph, restrict_core_role,
                    restrict_core_ere)

        # execute choice
        if choice == "c":
            show_core(json_graph, this_hypothesis_collection)
        elif choice == "e":
            show_ere(json_graph, this_hypothesis_collection, roles_ontology)
        elif choice == "r":
            show_rolefiller(json_graph, this_hypothesis_collection,
                            roles_ontology)
        elif choice == "se":
            show_ere_graphenv(json_graph, roles_ontology)
        elif choice == "sr":
            show_role_graphenv(json_graph, this_hypothesis_collection,
                               roles_ontology)
        elif choice == "R":
            restrict_core_role = input("Core role to restrict: ")
            restrict_core_ere = input(
                "Value to restrict the core role to (ERE ID): ")

            # permanent: replaces the collection every later round starts from
            hypothesis_collection = filter_hypotheses_by_entrypoints(
                hypothesis_collection, json_graph, restrict_core_role,
                restrict_core_ere)
        elif choice == "p":
            # print only the hypotheses selected by the question ID (and
            # optional core role) entered above, as the menu promises
            print_hypotheses(json_graph, this_hypothesis_collection,
                             roles_ontology)
示例#13
0
def main():
    """Dump every hypothesis as a text file and collect all of them in a CSV.

    Command-line arguments: the graph JSON file, the hypotheses JSON file,
    the roles ontology file, and the output directory.

    Each hypothesis is written to ``hypothesis-NNN.txt`` in the output
    directory. The same text is then split into one CSV row per non-empty
    entry and per question ID of the hypothesis. All rows are sorted by the
    numbers found in their SIN label and written to a CSV file that carries
    the name of the hypotheses file with a ``.csv`` suffix.
    """
    # Imported locally so this function does not rely on a file-level import.
    from pathlib import Path

    # Prefixes that mark the second field of an entry as a time field.
    time_tags = ('T1', 'T2', 'T3', 'T4')
    # Number of columns of a row before the SIN label is prepended
    # (the CSV header has ten columns, the first one being the SIN).
    num_fields = 9

    parser = ArgumentParser()
    parser.add_argument('graph_path', help='path to the graph JSON file')
    parser.add_argument('hypothesis_path', help='path to the JSON file with hypotheses')
    parser.add_argument('roles_ontology_path', help='path to the roles ontology file')
    parser.add_argument('output_dir', help='directory to write human-readable hypotheses')

    args = parser.parse_args()

    json_graph = JsonGraph.from_dict(util.read_json_file(args.graph_path, 'JSON graph'))

    hypotheses_json = util.read_json_file(args.hypothesis_path, 'hypotheses')
    hypothesis_collection = AidaHypothesisCollection.from_json(hypotheses_json, json_graph)

    roles_ontology = util.read_json_file(args.roles_ontology_path, 'roles ontology')

    output_dir = util.get_output_dir(args.output_dir, overwrite_warning=True)

    # Pairs of (numeric sort key, CSV row).
    output_list = []
    for hypo_idx, hypothesis in enumerate(hypothesis_collection.hypotheses):
        output_path = output_dir / 'hypothesis-{:0>3d}.txt'.format(hypo_idx)
        result, _ = hypothesis.to_str_for_csv(roles_ontology)
        with open(str(output_path), "w", encoding="utf-8") as fout:
            print(result, file=fout)

        # Commas inside the text would break the CSV columns, so replace them
        # before turning the indented continuation lines into columns.
        result = result.replace(',', ' &').replace('ID: ', '')
        result_list = result.replace('\n    ', ',').split('\n\n')
        for ere_idx, res in enumerate(result_list):
            # Empty entries are skipped, but still count for ere_idx.
            if not res:
                continue

            fields = res.split(',')

            # The second column is reserved for the time field: insert an
            # empty one when the entry has none. A prefix test is used (not a
            # substring test against 'T1T2T3T4', which would also accept
            # '1T', '2T' and '3T'), and entries with a single field no longer
            # raise an IndexError.
            if len(fields) < 2 or not fields[1].startswith(time_tags):
                fields.insert(1, '')
            # Pad with empty columns right before the last field, so that the
            # last field stays in the last column.
            for _ in range(num_fields - len(fields)):
                fields.insert(-1, '')
            # Blank out "label: " fields that carry no value.
            for idx, field in enumerate(fields):
                parts = field.split(': ')
                if len(parts) == 2 and parts[1] == '':
                    fields[idx] = ''

            for question_id in hypothesis.questionIDs:
                # Only the part after the third '_' of the question ID is
                # used in the SIN label.
                short_id = '_'.join(question_id.split('_')[3:])
                sin_info = short_id + '.{}.{}'.format(hypo_idx + 1, ere_idx + 1)
                # Sort key: the digits of every '_' / '.' separated part.
                sort_key = tuple(
                    int(''.join(ch for ch in part if ch.isdigit()))
                    for part in sin_info.replace('.', '_').split('_'))
                output_list.append((sort_key, ','.join([sin_info] + fields)))

    output_list.sort(key=lambda x: (x[0][0], x[0][2], x[0][1], x[0][3], x[0][4]))

    # Name the CSV after the hypotheses file. Only the suffix is replaced, and
    # the file name is extracted in a platform-independent way.
    csv_output_path = output_dir / Path(args.hypothesis_path).with_suffix('.csv').name
    with open(str(csv_output_path), 'w', encoding="utf-8") as csv_file:
        csv_file.write('SIN,Event or Relation type,time,arg1,arg2,arg3,arg4,arg5,comments,ID\n')
        prev_key = None
        for sort_key, row in output_list:
            if prev_key is not None:
                # One blank line for each of the first four key components
                # that differs from the previous row.
                num_changed = sum(prev_key[i] != sort_key[i] for i in range(4))
                csv_file.write('\n' * num_changed)
            csv_file.write(row + '\n')
            prev_key = sort_key
示例#14
0
def main():
    """Map hypotheses over the coref-compressed graph back to the original graph.

    For every hypotheses file, each compressed statement is expanded into the
    original statements recorded in the coref log. An original statement that
    is reached from several compressed statements keeps the largest weight.
    Cluster memberships are collected for the subject (and, for non-type
    statements, the object) of every original statement. The result is
    written under the same file name into the output directory.
    """
    arg_parser = ArgumentParser()
    arg_parser.add_argument(
        'hypotheses_path',
        help='path to the input json file for hypotheses, or a directory with '
        'a list of hypotheses files')
    arg_parser.add_argument(
        'output_dir', help='directory to write the coref-recovered hypotheses')
    arg_parser.add_argument('original_graph_path',
                            help='path to the original graph json file')
    arg_parser.add_argument('compressed_graph_path',
                            help='path to the compressed graph json file')
    arg_parser.add_argument('input_log_path',
                            help='path to log file from coref compression')
    arg_parser.add_argument(
        '-f',
        '--force',
        action='store_true',
        default=False,
        help='If specified, overwrite existing output files without warning')

    args = arg_parser.parse_args()
    warn_on_overwrite = not args.force

    hypotheses_file_paths = util.get_file_list(args.hypotheses_path,
                                               suffix='.json',
                                               sort=True)

    output_dir = util.get_output_dir(args.output_dir,
                                     overwrite_warning=warn_on_overwrite)

    original_graph_json = util.read_json_file(args.original_graph_path,
                                              'original JSON graph')
    compressed_graph_json = util.read_json_file(args.compressed_graph_path,
                                                'compressed JSON graph')
    input_log_json = util.read_json_file(args.input_log_path, 'coref log')

    for hypotheses_file_path in hypotheses_file_paths:
        input_hypotheses_json = util.read_json_file(hypotheses_file_path,
                                                    'hypotheses')

        # The probabilities are carried over unchanged.
        recovered_support = []
        output_hypotheses_json = {
            'probs': input_hypotheses_json['probs'],
            'support': recovered_support
        }

        for compressed_hypothesis in input_hypotheses_json["support"]:
            # Weight of every original (pre-compression) statement.
            weight_by_stmt = {}
            # (ERE, cluster) pairs to include in the recovered hypothesis.
            memberships = set()

            weighted_stmts = zip(compressed_hypothesis['statements'],
                                 compressed_hypothesis['statementWeights'])
            for compressed_stmt, stmt_weight in weighted_stmts:
                compressed_entry = compressed_graph_json['theGraph'][
                    compressed_stmt]
                # Cluster(s) of the subject of the compressed statement.
                subj_clusters = input_log_json['prototype_to_clusters'][
                    compressed_entry['subject']]
                is_type_stmt = compressed_entry['predicate'] == 'type'
                # Only edge statements have an object with cluster(s).
                obj_clusters = None
                if not is_type_stmt:
                    obj_clusters = input_log_json['prototype_to_clusters'][
                        compressed_entry['object']]

                for original_stmt in input_log_json['new_stmt_to_old_stmts'][
                        compressed_stmt]:
                    # Keep the largest weight seen for an original statement.
                    if (original_stmt not in weight_by_stmt
                            or weight_by_stmt[original_stmt] < stmt_weight):
                        weight_by_stmt[original_stmt] = stmt_weight

                    original_entry = original_graph_json['theGraph'][
                        original_stmt]

                    # Link the original subject to each subject cluster.
                    subj = original_entry['subject']
                    memberships.update(
                        (subj, cluster) for cluster in subj_clusters)

                    # The original statement must be a type statement
                    # exactly when the compressed one is.
                    assert (original_entry['predicate'] == 'type') == is_type_stmt

                    if not is_type_stmt:
                        # Link the original object to each object cluster.
                        obj = original_entry['object']
                        memberships.update(
                            (obj, cluster) for cluster in obj_clusters)

            failed_queries = compressed_hypothesis['failedQueries']

            query_stmts = set()
            for compressed_query_stmt in compressed_hypothesis[
                    'queryStatements']:
                query_stmts.update(input_log_json['new_stmt_to_old_stmts']
                                   [compressed_query_stmt])

            recovered_support.append({
                'statements': list(weight_by_stmt.keys()),
                'statementWeights': list(weight_by_stmt.values()),
                'clusterMemberships': list(memberships),
                'failedQueries': failed_queries,
                'queryStatements': list(query_stmts),
            })

        # Optional top-level entries are copied over when present.
        for optional_key in ('graph', 'queries'):
            if optional_key in input_hypotheses_json:
                output_hypotheses_json[optional_key] = input_hypotheses_json[
                    optional_key]

        output_path = util.get_output_path(
            output_dir / hypotheses_file_path.name,
            overwrite_warning=warn_on_overwrite)
        print('Writing coref-recovered hypotheses to {}'.format(output_path))
        with open(str(output_path), 'w') as fout:
            json.dump(output_hypotheses_json, fout, indent=2)
示例#15
0
def main():
    """Write the top-ranked hypotheses of every hypotheses file as Turtle files.

    Command-line arguments: the graph JSON file, the hypotheses JSON file or
    directory, the TA2 KB file (parsed as Turtle), the output directory, the
    run ID, the SIN ID prefix, and the options ``--top`` and ``--force``.

    For each hypotheses file, the hypotheses are ranked by their entry in
    ``probs`` (highest first). For each of the first ``--top`` hypotheses a
    subgraph of the KB is built and written to
    ``<run_id>.<soin_id>.<frame_id>.HNNN.ttl`` in the output directory.
    """
    parser = ArgumentParser()
    parser.add_argument('graph_path', help='path to the graph json file')
    parser.add_argument('hypotheses_path',
                        help='path to the hypotheses json directory')
    parser.add_argument('kb_path', help='path to the TA2 KB file (in AIF)')
    parser.add_argument('output_dir', help='path to output directory')
    parser.add_argument('run_id', help='TA3 run ID')
    parser.add_argument('sin_id_prefix',
                        help='prefix of SIN IDs to name the final hypotheses')
    parser.add_argument('--top',
                        default=50,
                        type=int,
                        help='number of top hypothesis to output')
    parser.add_argument(
        '-f',
        '--force',
        action='store_true',
        default=False,
        help='If specified, overwrite existing output files without warning')

    args = parser.parse_args()

    json_graph = JsonGraph.from_dict(
        util.read_json_file(args.graph_path, 'JSON graph'))

    graph_mappings = json_graph.build_cluster_member_mappings()

    hypotheses_file_paths = util.get_file_list(args.hypotheses_path,
                                               suffix='.json',
                                               sort=True)

    # TODO: there is a known bug in rdflib that
    #  rdflib.Literal("2008", datatype=rdflib.XSD.gYear) would be parsed into
    #  rdflib.term.Literal(u'2008-01-01', datatype=rdflib.XSD.gYear) automatically,
    #  because a `parse_date` function is invoked for all rdflib.XSD.gYear literals.
    #  This is a temporary workaround to patch the _toPythonMapping locally.
    #  c.f.: https://github.com/RDFLib/rdflib/issues/806
    # The default of None keeps the workaround from raising a KeyError when
    # the mapping has no gYear entry (any more).
    # noinspection PyProtectedMember
    rdflib.term._toPythonMapping.pop(rdflib.XSD['gYear'], None)

    print('Reading kb from {}'.format(args.kb_path))
    kb_graph = Graph()
    kb_graph.parse(args.kb_path, format='ttl')

    kb_nodes_by_category = catalogue_kb_nodes(kb_graph)

    kb_stmt_key_mapping = index_statement_nodes(
        kb_graph, kb_nodes_by_category['Statement'])
    kb_cm_key_mapping = index_cluster_membership_nodes(
        kb_graph, kb_nodes_by_category['ClusterMembership'])
    kb_type_stmt_key_mapping = index_type_statement_nodes(
        kb_graph, kb_nodes_by_category['TypeStatement'])

    output_dir = util.get_output_dir(args.output_dir,
                                     overwrite_warning=not args.force)

    run_id = args.run_id
    sin_id_prefix = args.sin_id_prefix

    # A zero or negative --top means that no hypothesis is written at all.
    num_top = max(args.top, 0)

    for hypotheses_file_path in hypotheses_file_paths:
        hypotheses_json = util.read_json_file(hypotheses_file_path,
                                              'hypotheses')

        print('Found {} hypotheses with probability {}'.format(
            len(hypotheses_json['probs']), hypotheses_json['probs']))

        # The SoIN ID is the prefix plus the part of the file name before
        # the first underscore.
        soin_id = sin_id_prefix + '_' + hypotheses_file_path.stem.split('_')[0]
        frame_id = soin_id + '_F1'

        # Pairs of (index into 'support', prob), best hypothesis first.
        ranked_hypotheses = sorted(enumerate(hypotheses_json['probs']),
                                   key=itemgetter(1),
                                   reverse=True)

        for top_count, (hypothesis_idx, prob) in enumerate(
                ranked_hypotheses[:num_top], start=1):
            # Non-positive values are exponentiated (after halving); positive
            # values get a small constant weight instead.
            # NOTE(review): this looks like prob is a log-probability - confirm.
            if prob <= 0.0:
                hypothesis_weight = math.exp(prob / 2.0)
            else:
                hypothesis_weight = 0.0001

            hypothesis = hypotheses_json['support'][hypothesis_idx]

            hypothesis_id = '{}_hypothesis_{:0>3d}'.format(frame_id, top_count)

            subgraph = build_subgraph_for_hypothesis(
                kb_graph=kb_graph,
                kb_nodes_by_category=kb_nodes_by_category,
                kb_stmt_key_mapping=kb_stmt_key_mapping,
                kb_cm_key_mapping=kb_cm_key_mapping,
                kb_type_stmt_key_mapping=kb_type_stmt_key_mapping,
                json_graph=json_graph,
                graph_mappings=graph_mappings,
                hypothesis=hypothesis,
                hypothesis_id=hypothesis_id,
                hypothesis_weight=hypothesis_weight)

            output_path = output_dir / '{}.{}.{}.H{:0>3d}.ttl'.format(
                run_id, soin_id, frame_id, top_count)
            print('Writing hypothesis #{:>2d} with prob {:>6.2f} to {}'.format(
                top_count, prob, output_path))
            # Turtle documents are UTF-8 encoded, so do not depend on the
            # default encoding of the platform.
            with open(str(output_path), 'w', encoding='utf-8') as fout:
                fout.write(print_graph(subgraph))
示例#16
0
def main():
    """Convert a TA2 KB to a JSON graph and, optionally, SoINs to JSON queries.

    The KB is always read, converted and written to ``graph_output_path``.
    When ``--soin_path`` is given, ``--query_output_dir`` is required as
    well: every ``.xml`` SoIN file found is parsed, its entry points are
    resolved against the KB, and the result is written to
    ``<SoIN file stem>_query.json`` in that directory.

    Exits through ``parser.error`` (usage message, exit status 2) when
    ``--soin_path`` is given without ``--query_output_dir``.
    """
    parser = ArgumentParser(
        description=
        'Read in a TA2 KB and a (list of) XML-based Statement of Information Need '
        'definition, convert the KB to JSON format, then convert each SoIN to a JSON '
        'query by identifying and ranking entry points.')
    parser.add_argument('kb_path', help='Path to the input TA2 KB')
    parser.add_argument('graph_output_path',
                        help='Path to write the JSON graph')
    parser.add_argument(
        '-s',
        '--soin_path',
        help=
        'Path to the input SoIN file, or a directory containing multiple SoIN '
        'files; if not provided, will only transform the graph')
    parser.add_argument(
        '-q',
        '--query_output_dir',
        help=
        'Directory to write the JSON queries, used when soin_path is provided')
    parser.add_argument(
        '-m',
        '--max_matches',
        type=int,
        default=50,
        help='The maximum number of EPs *per entry point description*')
    parser.add_argument(
        '-d',
        '--dup_kb',
        default=duplicate_kb_file,
        help='Path to the json file with duplicate KB ID mappings')
    parser.add_argument(
        '-f',
        '--force',
        action='store_true',
        default=False,
        help='If specified, overwrite existing output files without warning')

    args = parser.parse_args()

    # Validate the option combination up front: an assert would be stripped
    # under `python -O`, and failing only after the graph has been built and
    # written wastes the whole conversion.
    if args.soin_path is not None and args.query_output_dir is None:
        parser.error('Must provide query_output_dir')

    kb_path = util.get_input_path(args.kb_path)
    graph_output_path = util.get_output_path(args.graph_output_path,
                                             overwrite_warning=not args.force)

    aida_graph = AidaGraph()
    aida_graph.build_graph(str(kb_path), fmt='ttl')

    json_graph = JsonGraph()
    json_graph.build_graph(aida_graph)

    logging.info('Writing JSON graph to %s ...', graph_output_path)
    with open(str(graph_output_path), 'w') as fout:
        json.dump(json_graph.as_dict(), fout, indent=1)
    logging.info('Done.')

    # Without a SoIN path only the graph is transformed.
    if args.soin_path is None:
        return

    soin_path = util.get_input_path(args.soin_path)
    query_output_dir = util.get_output_dir(args.query_output_dir,
                                           overwrite_warning=not args.force)

    soin_file_paths = util.get_file_list(soin_path, suffix='.xml', sort=True)

    dup_kb_id_mapping = None
    if args.dup_kb is not None:
        dup_kb_id_mapping = util.read_json_file(args.dup_kb,
                                                'duplicate KB ID mapping')

    logging.info('Getting Cluster Mappings ...')
    ere_to_prototypes = get_cluster_mappings(aida_graph)

    for soin_file_path in soin_file_paths:
        query_output_path = query_output_dir / (soin_file_path.stem +
                                                '_query.json')

        logging.info('Processing SOIN %s ...', soin_file_path)
        soin = SOIN.parse(str(soin_file_path),
                          dup_kbid_mapping=dup_kb_id_mapping)

        logging.info('Resolving all entrypoints ...')
        soin.resolve(aida_graph,
                     ere_to_prototypes,
                     max_matches=args.max_matches)

        # The query records the graph it belongs to by the KB file stem.
        query_json = {'graph': kb_path.stem}
        query_json.update(soin.to_json())

        logging.info('Writing JSON query to %s ...', query_output_path)
        with open(str(query_output_path), 'w') as fout:
            json.dump(query_json, fout, indent=1)