Пример #1
0
def main():
    """Coref-compress a JSON graph and write the compressed graph and a log.

    Command-line arguments: the input graph path, the output graph path, the
    output log path, and an optional ``-f/--force`` flag that disables the
    overwrite warning when the two output paths are resolved.

    The log file holds the mappings returned by ``build_mappings``, except
    those whose name contains ``'key'``. For mappings whose name ends in
    ``'s'`` every value is converted to a list before dumping (presumably
    because those values are sets, which JSON cannot encode -- confirm
    against ``build_mappings``).
    """
    arg_parser = ArgumentParser()
    arg_parser.add_argument(
        'input_graph_path', help='path to the input graph json file')
    arg_parser.add_argument(
        'output_graph_path', help='path to write the coref-compressed graph')
    arg_parser.add_argument(
        'output_log_path', help='path to write the log file')
    arg_parser.add_argument(
        '-f', '--force', action='store_true', default=False,
        help='If specified, overwrite existing output files without warning')
    args = arg_parser.parse_args()

    # Resolve both output locations up front, before any work is done.
    warn_on_overwrite = not args.force
    output_graph_path = util.get_output_path(
        args.output_graph_path, overwrite_warning=warn_on_overwrite)
    output_log_path = util.get_output_path(
        args.output_log_path, overwrite_warning=warn_on_overwrite)

    graph_dict = util.read_json_file(args.input_graph_path, 'JSON graph')
    input_json_graph = JsonGraph.from_dict(graph_dict)

    # Sanity check: the ERE iterator and the ERE container must agree.
    num_old_eres = sum(1 for _ in input_json_graph.each_ere())
    assert num_old_eres == len(input_json_graph.eres)
    num_old_stmts = sum(1 for _ in input_json_graph.each_statement())
    logging.info(
        'Found {} EREs and {} statements in the original graph'.format(
            num_old_eres, num_old_stmts))

    mappings = build_mappings(input_json_graph)

    # Both compress_* helpers fill output_json_graph and return a count.
    output_json_graph = JsonGraph()
    num_new_eres = compress_eres(input_json_graph, mappings, output_json_graph)
    num_new_stmts = compress_statements(
        input_json_graph, mappings, output_json_graph)
    logging.info(
        'Finished coref-compressed graph with {} EREs and {} statements'.
        format(num_new_eres, num_new_stmts))

    logging.info(
        'Writing compressed json graph to {}'.format(output_graph_path))
    with open(str(output_graph_path), 'w') as graph_file:
        json.dump(output_json_graph.as_dict(), graph_file, indent=1)

    # Build the JSON-serialisable view of the mappings for the log file.
    log_json = {}
    for name, mapping in mappings.items():
        if 'key' in name:
            continue
        log_json[name] = (
            {k: list(v) for k, v in mapping.items()}
            if name.endswith('s') else mapping)

    logging.info('Writing compression log to {}'.format(output_log_path))
    with open(str(output_log_path), 'w') as log_file:
        json.dump(log_json, log_file, indent=2)
Пример #2
0
def main():
    """Print the average of every statistic collected over all hypotheses.

    Command-line arguments: the graph JSON path and the hypotheses JSON path.
    ``hyp_stats`` is called once per hypothesis and returns the accumulator
    used for the next call; each accumulated list is then averaged and
    printed rounded to two decimals.
    """
    arg_parser = ArgumentParser()
    # required positional
    arg_parser.add_argument('graph_path', help='path to the graph JSON file')
    arg_parser.add_argument(
        'hypothesis_path', help='path to the JSON file with hypotheses')
    args = arg_parser.parse_args()

    # read KB
    graph_dict = util.read_json_file(args.graph_path, 'JSON graph')
    json_graph = JsonGraph.from_dict(graph_dict)

    # read hypotheses
    hypotheses_json = util.read_json_file(args.hypothesis_path, 'hypotheses')
    hypothesis_collection = AidaHypothesisCollection.from_json(
        hypotheses_json, json_graph)

    # statistic name -> list of values, presumably one per hypothesis
    analysis_obj = defaultdict(list)
    for hyp in hypothesis_collection:
        analysis_obj = hyp_stats(hyp, analysis_obj, json_graph)

    print("================ Overall =============")
    for key, val in analysis_obj.items():
        average = sum(val) / len(val)
        print(key, round(average, 2))
Пример #3
0
def main():
    """Filter every hypotheses file and write the result to the output dir.

    Command-line arguments: the graph JSON path, a hypotheses file or a
    directory of ``.json`` files, the output directory, and ``-f/--force``.

    For each input file the collection is expanded, hypotheses for which
    ``hypothesis_too_short`` is true are dropped, the rest are passed through
    an ``AidaHypothesisFilter``, and the list is run through ``compactify``.
    The ``graph`` and ``queries`` entries of the input JSON, when present,
    are copied to the output JSON. Output files keep the input file names.
    """
    arg_parser = ArgumentParser()
    arg_parser.add_argument(
        'graph_path', help='Path to the input graph JSON file')
    arg_parser.add_argument(
        'hypotheses_path',
        help='Path to the raw hypotheses file, or a directory with multiple files')
    arg_parser.add_argument(
        'output_dir',
        help='Directory to write the filtered hypothesis files(s)')
    arg_parser.add_argument(
        '-f', '--force', action='store_true', default=False,
        help='If specified, overwrite existing output files without warning')
    args = arg_parser.parse_args()

    output_dir = util.get_output_dir(
        args.output_dir, overwrite_warning=not args.force)

    graph_dict = util.read_json_file(args.graph_path, 'JSON graph')
    json_graph = JsonGraph.from_dict(graph_dict)
    hypotheses_file_paths = util.get_file_list(
        args.hypotheses_path, suffix='.json', sort=True)

    for hypotheses_file_path in hypotheses_file_paths:
        hypotheses_json = util.read_json_file(
            hypotheses_file_path, 'hypotheses')
        hypothesis_collection = AidaHypothesisCollection.from_json(
            hypotheses_json, json_graph)
        hypothesis_collection.expand()

        # A fresh filter is created for every input file.
        hypothesis_filter = AidaHypothesisFilter(json_graph)

        # The length check runs first; only survivors are filtered.
        filtered_hyplist = []
        for hypothesis in hypothesis_collection:
            if hypothesis_too_short(hypothesis, json_graph):
                continue
            filtered_hyplist.append(hypothesis_filter.filtered(hypothesis))

        compacted = compactify(filtered_hyplist, json_graph)
        filtered_hypotheses_json = AidaHypothesisCollection(compacted).to_json()

        # Carry over the graph filename and queries, if they were there before.
        for carried_key in ('graph', 'queries'):
            if carried_key in hypotheses_json:
                filtered_hypotheses_json[carried_key] = \
                    hypotheses_json[carried_key]

        output_path = output_dir / hypotheses_file_path.name
        logging.info(
            'Writing filtered hypotheses to {} ...'.format(output_path))
        with open(str(output_path), 'w') as fout:
            json.dump(filtered_hypotheses_json, fout, indent=1)
Пример #4
0
def main():
    """Print statistics for a TA2 KB (``.ttl``) or a JSON graph (``.json``).

    The single command-line argument is the input path. The file suffix
    selects the reader: ``.ttl`` files are loaded into an ``AidaGraph`` and
    passed to ``get_kb_stats``; ``.json`` files are loaded into a
    ``JsonGraph`` and passed to ``get_json_stats``.

    Any other suffix is reported as a usage error (exit status 2) instead of
    silently doing nothing.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('input_path', help='Path to a TA2 KB or a JSON graph')

    args = parser.parse_args()

    input_path = util.get_input_path(args.input_path)

    if input_path.suffix == '.ttl':
        aida_graph = AidaGraph()
        aida_graph.build_graph(str(input_path), fmt='ttl')

        get_kb_stats(aida_graph)

    elif input_path.suffix == '.json':
        with open(input_path, 'r') as fin:
            json_graph = JsonGraph.from_dict(json.load(fin))

        get_json_stats(json_graph)

    else:
        # Without this branch an unsupported file produced no output and a
        # zero exit status, which looks like success to the caller.
        parser.error(
            'unsupported input file suffix {!r}: expected .ttl or .json'.format(
                input_path.suffix))
Пример #5
0
def main():
    """Write each hypothesis to its own human-readable text file.

    Command-line arguments: the graph JSON path, the hypotheses JSON path,
    the roles ontology path and the output directory. The output directory
    is always resolved with the overwrite warning enabled. Files are named
    ``hypothesis-NNN.txt`` with a zero-based, zero-padded index.
    """
    arg_parser = ArgumentParser()
    positional_args = (
        ('graph_path', 'path to the graph JSON file'),
        ('hypothesis_path', 'path to the JSON file with hypotheses'),
        ('roles_ontology_path', 'path to the roles ontology file'),
        ('output_dir', 'directory to write human-readable hypotheses'),
    )
    for arg_name, arg_help in positional_args:
        arg_parser.add_argument(arg_name, help=arg_help)
    args = arg_parser.parse_args()

    graph_dict = util.read_json_file(args.graph_path, 'JSON graph')
    json_graph = JsonGraph.from_dict(graph_dict)

    hypotheses_json = util.read_json_file(args.hypothesis_path, 'hypotheses')
    hypothesis_collection = AidaHypothesisCollection.from_json(
        hypotheses_json, json_graph)

    roles_ontology = util.read_json_file(
        args.roles_ontology_path, 'roles ontology')

    output_dir = util.get_output_dir(args.output_dir, overwrite_warning=True)

    for idx, hypothesis in enumerate(hypothesis_collection.hypotheses):
        file_name = 'hypothesis-{:0>3d}.txt'.format(idx)
        with open(str(output_dir / file_name), "w", encoding="utf-8") as fout:
            print(hypothesis.to_str(roles_ontology), file=fout)
Пример #6
0
def main():
    """Rerank raw hypothesis seeds and write them as hypothesis collections.

    Command-line arguments: the graph JSON path, a raw seeds file or a
    directory of ``.json`` seeds files, the output directory, the optional
    ``--plausibility_model_path`` and ``--indexer_path``, ``-n`` to cap the
    number of seeds, and ``-f/--force``.

    For each seeds file, every top-level entry except ``'graph'`` is read as
    a facet holding a list of seeds. Seeds are reranked by plausibility only
    when BOTH the model path and the indexer path are given; a warning is
    logged when exactly one of them is supplied, because the reranking step
    is then skipped. Seeds are then selected by ``select_seeds_by_novelty``
    and written to ``<prefix>_seeds.json``, where ``<prefix>`` is the input
    file name up to its first underscore.

    Raises:
        KeyError: if a seeds file has no top-level ``'graph'`` entry.
    """
    parser = ArgumentParser()
    parser.add_argument('graph_path', help='Path to the input graph JSON file')
    parser.add_argument(
        'raw_seeds_path',
        help='Path to the raw hypothesis seeds file, or a directory with '
        'multiple seeds files')
    parser.add_argument(
        'output_dir', help='Directory to write the reranked hypothesis seeds')
    parser.add_argument('--plausibility_model_path',
                        help='Path to a hypothesis plausibility model')
    parser.add_argument('--indexer_path', help="Path to the indexers file")
    parser.add_argument('-n',
                        '--max_num_seeds',
                        type=int,
                        default=None,
                        help='Only output up to n hypothesis seeds')
    parser.add_argument(
        '-f',
        '--force',
        action='store_true',
        default=False,
        help='If specified, overwrite existing output files without warning')

    args = parser.parse_args()

    has_model = args.plausibility_model_path is not None
    has_indexer = args.indexer_path is not None
    rerank_by_plausibility = has_model and has_indexer
    if has_model != has_indexer:
        # Exactly one of the two options was given: make the skip visible
        # instead of silently ignoring the option the user did provide.
        logging.warning(
            'Both --plausibility_model_path and --indexer_path are required '
            'for plausibility reranking; skipping the reranking step')

    json_graph = JsonGraph.from_dict(
        util.read_json_file(args.graph_path, 'JSON graph'))

    raw_seeds_file_paths = util.get_file_list(args.raw_seeds_path,
                                              suffix='.json',
                                              sort=True)

    output_dir = util.get_output_dir(args.output_dir,
                                     overwrite_warning=not args.force)

    for raw_seeds_file_path in raw_seeds_file_paths:
        raw_seeds_json = util.read_json_file(raw_seeds_file_path,
                                             'seeds by facet')

        # Every top-level key except 'graph' is a facet label.
        seeds_by_facet = {
            facet_label: [
                HypothesisSeed.from_json(seed_json, json_graph)
                for seed_json in facet_seeds_json
            ]
            for facet_label, facet_seeds_json in raw_seeds_json.items()
            if facet_label != 'graph'
        }

        if rerank_by_plausibility:
            seeds_by_facet = rerank_seeds_by_plausibility(
                seeds_by_facet, args.graph_path, args.plausibility_model_path,
                args.indexer_path)

        seeds = select_seeds_by_novelty(seeds_by_facet, args.max_num_seeds)

        hypotheses_to_export = []

        # Turn ranks into the log weights of seed hypotheses. The numbers
        # carry no meaning beyond ordering: log(1), log(1/2), log(1/3), ...
        for rank, seed in enumerate(seeds):
            seed.hypothesis.update_weight(math.log(1.0 / (rank + 1)))
            hypotheses_to_export.append(seed.finalize())

        hypothesis_collection = AidaHypothesisCollection(hypotheses_to_export)

        seeds_json = hypothesis_collection.to_json()
        seeds_json['graph'] = raw_seeds_json['graph']

        output_path = output_dir / (raw_seeds_file_path.name.split('_')[0] +
                                    '_seeds.json')
        logging.info(
            'Writing re-ranked hypothesis seeds to {} ...'.format(output_path))
        with open(str(output_path), 'w') as fout:
            json.dump(seeds_json, fout, indent=1)
def main():
    """Write one SPARQL update per top hypothesis that inserts aida:handle.

    Command-line arguments: the graph JSON path, the hypotheses JSON path,
    the output directory, ``--top`` (default 50) and ``-f/--force``.

    Hypotheses are visited in descending order of ``hypotheses_json['probs']``.
    For each one, ``compute_handle_mapping`` yields a prototype -> handle
    mapping; every pair becomes one ``aida:handle`` triple in an
    ``INSERT DATA`` block, with double quotes stripped from both ends of the
    handle. Files are named ``hypothesis-NNN-update.rq`` with a one-based
    rank. The loop stops once ``--top`` files have been written; the check
    runs after the write, so at least one file is produced whenever there is
    at least one hypothesis.
    """
    arg_parser = ArgumentParser()
    arg_parser.add_argument('graph_path', help='path to the graph json file')
    arg_parser.add_argument(
        'hypotheses_path', help='path to the hypotheses json file')
    arg_parser.add_argument('output_dir', help='Directory to write queries')
    arg_parser.add_argument(
        '--top', default=50, type=int,
        help='number of top hypothesis to output')
    arg_parser.add_argument(
        '-f', '--force', action='store_true', default=False,
        help='If specified, overwrite existing output files without warning')
    args = arg_parser.parse_args()

    graph_dict = util.read_json_file(args.graph_path, 'JSON graph')
    json_graph = JsonGraph.from_dict(graph_dict)
    mappings = json_graph.build_cluster_member_mappings()

    hypotheses_json = util.read_json_file(args.hypotheses_path, 'hypotheses')

    output_dir = util.get_output_dir(
        args.output_dir, overwrite_warning=not args.force)

    # (index, prob) pairs, most probable first.
    ranked_results = sorted(
        enumerate(hypotheses_json['probs']), key=itemgetter(1), reverse=True)

    for top_count, (result_idx, _prob) in enumerate(ranked_results, start=1):
        hypothesis = hypotheses_json['support'][result_idx]

        query_parts = [update_prefix + 'INSERT DATA\n{\n']

        prototype_handles = compute_handle_mapping(
            json_graph,
            hypothesis,
            member_to_clusters=mappings['member_to_clusters'],
            cluster_to_prototype=mappings['cluster_to_prototype'])

        for prototype, handle in prototype_handles.items():
            # Drop surrounding double quotes before re-quoting the literal.
            bare_handle = handle.strip('"')
            query_parts.append('  <{}> aida:handle "{}" .\n'.format(
                prototype, bare_handle))

        query_parts.append('}')

        file_name = 'hypothesis-{:0>3d}-update.rq'.format(top_count)
        with open(str(output_dir / file_name), 'w') as fout:
            fout.write(''.join(query_parts))

        if top_count >= args.top:
            break
Пример #8
0
def main():
    """Write a SPARQL query per top hypothesis and optionally run tdbquery.

    Command-line arguments: the graph JSON path, the hypotheses JSON path, a
    directory holding tdb database copies (entries matching ``copy*``), the
    output directory, ``--top`` (default 50), ``--dry_run``, ``--query_just``,
    ``--query_conf`` and ``-f/--force``.

    Hypotheses are visited in descending order of ``hypotheses_json['probs']``.
    For each one the query text from ``queries_for_aida_result`` is written to
    ``hypothesis-NNN-query.rq``. Unless ``--dry_run`` is given, ``tdbquery``
    is run against the first database copy and its standard output is stored
    in ``hypothesis-NNN-raw.ttl``.

    ``tdbquery`` is started with an argument list rather than a shell command
    line, so paths containing spaces or shell metacharacters are passed
    through intact. A non-zero exit status is reported on standard output.

    Raises:
        FileNotFoundError: if ``tdbquery`` is not on ``PATH`` (only when
            queries are executed).
    """
    parser = ArgumentParser()
    parser.add_argument('graph_path', help='path to the graph json file')
    parser.add_argument('hypotheses_path', help='path to the hypotheses json file')
    parser.add_argument('db_dir', help='directory with copies of tdb databases')
    parser.add_argument('output_dir', help='path to output directory')
    parser.add_argument('--top', default=50, type=int,
                        help='number of top hypothesis to output')
    parser.add_argument('--dry_run', action='store_true',
                        help='if specified, only write the SPARQL queries to '
                             'files, without actually executing the queries')
    parser.add_argument('--query_just', action='store_true')
    parser.add_argument('--query_conf', action='store_true')
    parser.add_argument('-f', '--force', action='store_true', default=False,
                        help='If specified, overwrite existing output files without warning')

    args = parser.parse_args()

    json_graph = JsonGraph.from_dict(util.read_json_file(args.graph_path, 'JSON graph'))

    mappings = json_graph.build_cluster_member_mappings()
    member_to_clusters = mappings['member_to_clusters']
    cluster_to_prototype = mappings['cluster_to_prototype']
    prototype_set = set(mappings['prototype_to_clusters'].keys())

    hypotheses_json = util.read_json_file(args.hypotheses_path, 'hypotheses')

    output_dir = util.get_output_dir(args.output_dir, overwrite_warning=not args.force)

    db_dir = util.get_input_path(args.db_dir)
    db_path_list = [str(path) for path in sorted(db_dir.glob('copy*'))]
    print('Using the following tdb databases to query: {}'.format(db_path_list))

    if not args.dry_run and not db_path_list:
        # Executing a query needs db_path_list[0]; fail with a clear message
        # instead of an IndexError in the middle of the run.
        parser.error(
            'no tdb database matching "copy*" found in {}'.format(db_dir))

    num_node_queries = len(db_path_list)

    top_count = 0
    for result_idx, prob in sorted(
            enumerate(hypotheses_json['probs']), key=itemgetter(1), reverse=True):
        hypothesis = hypotheses_json['support'][result_idx]
        sparql_query_str = queries_for_aida_result(
            json_graph=json_graph,
            hypothesis=hypothesis,
            member_to_clusters=member_to_clusters,
            cluster_to_prototype=cluster_to_prototype,
            prototype_set=prototype_set,
            num_node_queries=num_node_queries,
            query_just=args.query_just,
            query_conf=args.query_conf)

        top_count += 1

        print(f'Writing queries for hypothesis #{top_count} with prob {prob}')

        sparql_query_path = output_dir / 'hypothesis-{:0>3d}-query.rq'.format(top_count)
        with open(str(sparql_query_path), 'w') as fout:
            fout.write(sparql_query_str + '\n')

        if not args.dry_run:
            query_result_path = output_dir / 'hypothesis-{:0>3d}-raw.ttl'.format(top_count)

            print('Executing queries ...')
            # Flush so our messages appear before anything tdbquery writes
            # to the shared terminal (its stderr is inherited).
            print('query {}'.format(sparql_query_path), flush=True)

            # No shell: every path is one argv entry, and stdout is
            # redirected by handing tdbquery the open result file.
            with open(str(query_result_path), 'w') as result_file:
                completed = subprocess.run(
                    ['tdbquery', '--loc', db_path_list[0],
                     '--query', str(sparql_query_path)],
                    stdout=result_file)

            if completed.returncode != 0:
                print('tdbquery exited with status {} for {}'.format(
                    completed.returncode, sparql_query_path))

        if top_count >= args.top:
            break
Пример #9
0
def main():
    """Build raw hypothesis seeds for every query file and write them as JSON.

    Command-line arguments: the graph JSON path, a query file or a directory
    of ``.json`` query files, the output directory, and the options
    ``-n/--max_num_seeds_per_facet``, ``-d/--discard_failed_core_constraints``,
    ``-r/--rank_cutoff`` (default 100), ``--frame_grouping`` and
    ``-f/--force``.

    Each query is handed to ``make_cluster_seeds`` together with the options
    above; the returned JSON is written to ``<prefix>_seeds.json``, where
    ``<prefix>`` is the query file name up to its first underscore.
    """
    arg_parser = ArgumentParser()
    arg_parser.add_argument(
        'graph_path', help='Path to the input graph JSON file')
    arg_parser.add_argument(
        'query_path',
        help='Path to the input query file, or a directory with multiple queries')
    arg_parser.add_argument(
        'output_dir', help='Directory to write the raw hypothesis seeds')
    arg_parser.add_argument(
        '-n', '--max_num_seeds_per_facet', type=int, default=None,
        help='If provided, only save up to <arg> seeds per facet')
    arg_parser.add_argument(
        '-d', '--discard_failed_core_constraints', action='store_true',
        help='If specified, discard hypotheses with failed core constraints. '
        'Try not to use this one during evaluation at first, so that we '
        'do not discard hypotheses we might still need. If we have too many '
        'hypotheses and the script runs too slowly, then use this.')
    arg_parser.add_argument(
        '-r', '--rank_cutoff', type=int, default=100,
        help='If specified, discard hypotheses early if there are at least <arg> '
        'other hypotheses that have the same fillers for a certain number '
        '(default = 3) of their non-entrypoint query variables. We might '
        'need this in the evaluation if some facets have many variables '
        'that lead to combinatorial explosion.')
    arg_parser.add_argument(
        '--frame_grouping', action='store_true',
        help='If specified, group query constraints by frames instead of by facets')
    arg_parser.add_argument(
        '-f', '--force', action='store_true',
        help='If specified, overwrite existing output files without warning')
    args = arg_parser.parse_args()

    graph_dict = util.read_json_file(args.graph_path, 'JSON graph')
    json_graph = JsonGraph.from_dict(graph_dict)

    query_file_paths = util.get_file_list(
        args.query_path, suffix='.json', sort=True)

    output_dir = util.get_output_dir(
        args.output_dir, overwrite_warning=not args.force)

    for query_file_path in query_file_paths:
        query_json = util.read_json_file(query_file_path, 'query')

        raw_seeds_json = make_cluster_seeds(
            json_graph=json_graph,
            query_json=query_json,
            max_num_seeds_per_facet=args.max_num_seeds_per_facet,
            frame_grouping=args.frame_grouping,
            discard_failed_core_constraints=(
                args.discard_failed_core_constraints),
            rank_cutoff=args.rank_cutoff)

        # Output name: text before the first underscore + '_seeds.json'.
        name_prefix = query_file_path.name.partition('_')[0]
        output_path = output_dir / (name_prefix + '_seeds.json')
        logging.info(
            'Writing raw hypothesis seeds of each facet to {} ...'.format(
                output_path))
        with open(str(output_path), 'w') as fout:
            json.dump(raw_seeds_json, fout, indent=1)
def main():
    """Write the SPARQL update queries that attach importance values.

    Command-line arguments: the graph JSON path, the hypotheses JSON path,
    the output directory, the frame ID, ``--top`` (default 50) and
    ``-f/--force``.

    Hypotheses are visited in descending order of ``hypotheses_json['probs']``.
    For each one, numbered files ``hypothesis-NNN-update-MMMM.rq`` are
    written in this order:

    * 0000: an ``INSERT DATA`` block declaring the ``aida:Hypothesis``, its
      importance, its ``aida:Subgraph`` and the importance of every node
      returned by ``compute_importance_mapping``;
    * 0001: an ``INSERT ... WHERE`` adding every entity, relation and event
      to the subgraph via ``aida:subgraphContains``;
    * 0002 onwards: one ``INSERT ... WHERE`` per statement importance.

    The hypothesis importance is ``exp(prob / 2)`` when ``prob <= 0`` and
    ``0.0001`` otherwise.
    """
    arg_parser = ArgumentParser()
    arg_parser.add_argument('graph_path', help='path to the graph json file')
    arg_parser.add_argument(
        'hypotheses_path', help='path to the hypotheses json file')
    arg_parser.add_argument('output_dir', help='Directory to write queries')
    arg_parser.add_argument('frame_id', help='Frame ID of the hypotheses')
    arg_parser.add_argument(
        '--top', default=50, type=int,
        help='number of top hypothesis to output')
    arg_parser.add_argument(
        '-f', '--force', action='store_true', default=False,
        help='If specified, overwrite existing output files without warning')
    args = arg_parser.parse_args()

    graph_dict = util.read_json_file(args.graph_path, 'JSON graph')
    json_graph = JsonGraph.from_dict(graph_dict)
    mappings = json_graph.build_cluster_member_mappings()

    hypotheses_json = util.read_json_file(args.hypotheses_path, 'hypotheses')

    output_dir = util.get_output_dir(
        args.output_dir, overwrite_warning=not args.force)
    frame_id = args.frame_id

    def write_update(hypothesis_rank, query_index, query_text):
        """Write one update query to its numbered file in output_dir."""
        file_name = 'hypothesis-{:0>3d}-update-{:0>4d}.rq'.format(
            hypothesis_rank, query_index)
        with open(str(output_dir / file_name), 'w') as fout:
            fout.write(query_text)

    # (index, prob) pairs, most probable first.
    ranked_results = sorted(
        enumerate(hypotheses_json['probs']), key=itemgetter(1), reverse=True)

    for top_count, (result_idx, prob) in enumerate(ranked_results, start=1):
        hyp_weight = math.exp(prob / 2.0) if prob <= 0.0 else 0.0001

        hypothesis = hypotheses_json['support'][result_idx]

        hypothesis_id = '{}_hypothesis_{:0>3d}'.format(frame_id, top_count)
        hypothesis_name = 'utexas:{}'.format(hypothesis_id)
        subgraph_name = hypothesis_name + '_subgraph'

        update_query_count = 0

        # Query 0: aida:Hypothesis and its importance value, plus the
        # importance values of the nodes in node_importance.
        data_parts = [
            update_prefix + 'INSERT DATA\n{\n',
            '  {} a aida:Hypothesis .\n'.format(hypothesis_name),
            '  {} aida:importance "{:.4f}"^^xsd:double .\n'.format(
                hypothesis_name, hyp_weight),
            '  {} aida:hypothesisContent {} .\n'.format(
                hypothesis_name, subgraph_name),
            '  {} a aida:Subgraph .\n'.format(subgraph_name),
        ]

        stmt_importance, node_importance = compute_importance_mapping(
            json_graph,
            hypothesis,
            member_to_clusters=mappings['member_to_clusters'],
            cluster_to_prototype=mappings['cluster_to_prototype'])

        for node_id, importance_value in node_importance.items():
            data_parts.append(
                '  <{}> aida:importance "{:.4f}"^^xsd:double .\n'.format(
                    node_id, importance_value))
        data_parts.append('}')

        write_update(top_count, update_query_count, ''.join(data_parts))
        update_query_count += 1

        # Query 1: the aida:subgraphContains field of the aida:Subgraph node.
        # All ERE nodes are included for simplicity, as it is not required
        # that all KEs be included for NIST to evaluate in M18.
        subgraph_query = \
            'INSERT {{\n' \
            '{} aida:subgraphContains ?e .\n' \
            '}}\nWHERE\n{{\n' \
            '{{ ?e a aida:Entity }}\nUNION\n' \
            '{{ ?e a aida:Relation }}\nUNION\n' \
            '{{ ?e a aida:Event }}\n}}\n'.format(subgraph_name)

        write_update(
            top_count, update_query_count, update_prefix + subgraph_query)
        update_query_count += 1

        # Queries 2..: one per statement, because INSERT {} WHERE {} is
        # needed to allow BNode statements.
        for stmt_key, importance_value in stmt_importance.items():
            stmt_subj, stmt_pred, stmt_obj = stmt_key
            stmt_query = \
                'INSERT {{ ?x aida:importance "{:.4f}"^^xsd:double . }}\n' \
                'WHERE\n{{\n' \
                '?x a rdf:Statement .\n' \
                '?x rdf:subject <{}> .\n' \
                '?x rdf:predicate ldcOnt:{} .\n' \
                '?x rdf:object <{}> .\n}}\n'.format(
                    importance_value, stmt_subj, stmt_pred, stmt_obj)

            write_update(
                top_count, update_query_count, update_prefix + stmt_query)
            update_query_count += 1

        if top_count >= args.top:
            break
Пример #11
0
def main():
    """Interactive console for exploring hypotheses against a JSON graph.

    Command-line arguments: the graph JSON path, the hypotheses JSON path and
    the roles ontology path. After loading, a menu is shown repeatedly until
    the user enters ``x``.

    For the choices ``c``, ``e``, ``r`` and ``p`` the user is asked for a
    question ID and an optional core role / ERE restriction; the resulting
    filtered collection applies to that single command only. ``R`` narrows
    the base collection for the rest of the session.

    Every menu round starts from the base collection, so ``sr`` never sees
    an undefined or left-over filtered collection, and ``p`` prints the
    collection filtered by the requested question ID.
    """
    parser = ArgumentParser()
    # required positional
    parser.add_argument('graph_path', help='path to the graph JSON file')
    parser.add_argument('hypothesis_path',
                        help='path to the JSON file with hypotheses')
    parser.add_argument("roles_ontology_path", help="path to roles ontology")

    args = parser.parse_args()

    print("Reading in data...")

    # read KB
    json_graph = JsonGraph.from_dict(
        util.read_json_file(args.graph_path, 'JSON graph'))

    # read hypotheses
    hypotheses_json = util.read_json_file(args.hypothesis_path, 'hypotheses')
    hypothesis_collection = AidaHypothesisCollection.from_json(
        hypotheses_json, json_graph)

    # read roles ontology
    roles_ontology = util.read_json_file(args.roles_ontology_path,
                                         'roles ontology')

    # determine all question IDs
    questionIDs = set()
    for h in hypothesis_collection:
        questionIDs.update(h.questionIDs)

    choice = None
    while choice != "x":
        # Start every round from the base collection. Previously this name
        # was only bound inside the c/e/r/p branch, so choosing "sr" first
        # raised an error and later rounds silently reused a stale filter.
        this_hypothesis_collection = hypothesis_collection

        # determine core choice (sorted: a set has no stable display order)
        print("question IDs:", ", ".join(sorted(questionIDs)))
        print("Choose from:")
        print("c: core hypothesis display")
        print("e: show events/relations connected to an ere")
        print("r: show events/relations connected to a role filler")
        print("se: survey context of an ERE independent of hypotheses")
        print("sr: survey context of a role filler independent of hypotheses")
        print("p: print hypotheses for a particular question ID")
        print(
            "R: restrict hypotheses to be considered going forward, for the rest of the run"
        )
        print("x: exit")

        choice = input()

        # determine additional restrictions on hypotheses to consider
        if choice in ["c", "e", "r", "p"]:
            question_id = input("Question ID: ")

            # filter hypotheses by question ID
            this_hypothesis_collection = filter_hypotheses_by_question(
                hypothesis_collection, question_id)

            # additionally filter by a core role filler?
            restrict_core_role = input("Optional core role to restrict: ")
            if restrict_core_role != "":
                restrict_core_ere = input(
                    "Value to restrict the core role to (ERE ID): ")

                this_hypothesis_collection = filter_hypotheses_by_entrypoints(
                    this_hypothesis_collection, json_graph, restrict_core_role,
                    restrict_core_ere)

        # execute choice
        if choice == "c":
            show_core(json_graph, this_hypothesis_collection)
        elif choice == "e":
            show_ere(json_graph, this_hypothesis_collection, roles_ontology)
        elif choice == "r":
            show_rolefiller(json_graph, this_hypothesis_collection,
                            roles_ontology)
        elif choice == "se":
            show_ere_graphenv(json_graph, roles_ontology)
        elif choice == "sr":
            show_role_graphenv(json_graph, this_hypothesis_collection,
                               roles_ontology)
        elif choice == "R":
            restrict_core_role = input("Core role to restrict: ")
            restrict_core_ere = input(
                "Value to restrict the core role to (ERE ID): ")

            # Rebinding the base collection makes the restriction permanent.
            hypothesis_collection = filter_hypotheses_by_entrypoints(
                hypothesis_collection, json_graph, restrict_core_role,
                restrict_core_ere)
        elif choice == "p":
            # Print the collection filtered above, not the unfiltered base.
            print_hypotheses(json_graph, this_hypothesis_collection,
                             roles_ontology)
Пример #12
0
def main():
    """Write hypotheses as text files plus one combined CSV overview.

    Command-line arguments: the graph JSON path, the hypotheses JSON path,
    the roles ontology path and the output directory (always resolved with
    the overwrite warning enabled).

    Each hypothesis is written to ``hypothesis-NNN.txt`` (zero-based index)
    using the text from ``to_str_for_csv``. The same text is then split into
    blocks on blank lines; each non-empty block becomes one CSV row per
    question ID of the hypothesis. Rows are sorted by the numeric parts of
    their SIN label and written to ``<hypotheses file stem>.csv`` in the
    output directory, with blank lines between groups.

    Raises:
        ValueError: if a component of a SIN label contains no digit.
        IndexError: if a SIN label has fewer than five components.
    """
    # Local import so this function does not rely on the module importing it.
    from pathlib import Path

    parser = ArgumentParser()
    parser.add_argument('graph_path', help='path to the graph JSON file')
    parser.add_argument('hypothesis_path', help='path to the JSON file with hypotheses')
    parser.add_argument('roles_ontology_path', help='path to the roles ontology file')
    parser.add_argument('output_dir', help='directory to write human-readable hypotheses')

    args = parser.parse_args()

    json_graph = JsonGraph.from_dict(util.read_json_file(args.graph_path, 'JSON graph'))

    hypotheses_json = util.read_json_file(args.hypothesis_path, 'hypotheses')
    hypothesis_collection = AidaHypothesisCollection.from_json(hypotheses_json, json_graph)

    roles_ontology = util.read_json_file(args.roles_ontology_path, 'roles ontology')

    output_dir = util.get_output_dir(args.output_dir, overwrite_warning=True)

    # A second field counts as the time column only if it starts with one of
    # these. (The old substring test against 'T1T2T3T4' also accepted '1T',
    # '2T' and '3T'.)
    time_prefixes = ('T1', 'T2', 'T3', 'T4')

    output_list = []
    for hypo_idx, hypothesis in enumerate(hypothesis_collection.hypotheses):
        output_path = output_dir / 'hypothesis-{:0>3d}.txt'.format(hypo_idx)
        result, _ = hypothesis.to_str_for_csv(roles_ontology)
        with open(str(output_path), "w", encoding="utf-8") as fout:
            print(result, file=fout)

        # Commas inside the text would break the CSV; indented continuation
        # lines become the comma-separated fields of a row.
        result = result.replace(',', ' &').replace('ID: ', '')
        result_list = result.replace('\n    ', ',').split('\n\n')
        for ere_idx, res in enumerate(result_list):
            if not res:
                continue
            tmp_res_list = res.split(',')

            # Insert an empty time column when the block has none. The length
            # guard covers a block made of a single field.
            if len(tmp_res_list) < 2 or not tmp_res_list[1].startswith(time_prefixes):
                tmp_res_list.insert(1, '')
            # Pad to nine fields, keeping the last field in last position.
            for _ in range(9 - len(tmp_res_list)):
                tmp_res_list.insert(-1, '')
            # Blank out fields of the form "label: " that carry no value.
            for idx, tmp_res in enumerate(tmp_res_list):
                label_and_value = tmp_res.split(': ')
                if len(label_and_value) == 2 and label_and_value[1] == '':
                    tmp_res_list[idx] = ''

            for question_ID in hypothesis.questionIDs:
                question_ID = '_'.join(question_ID.split('_')[3:])
                sin_info = question_ID + '.{}.{}'.format(hypo_idx + 1, ere_idx + 1)
                # Numeric sort key: the digits of every '_'/'.' component.
                sin_info_list = tuple(
                    int(''.join(ch for ch in part if ch.isdigit()))
                    for part in sin_info.replace('.', '_').split('_'))
                row = ','.join([sin_info] + tmp_res_list)
                output_list.append((sin_info_list, row))

    output_list.sort(key=lambda x: (x[0][0], x[0][2], x[0][1], x[0][3], x[0][4]))

    # Swap only the file suffix; str.replace('json', 'csv') also rewrote any
    # other "json" occurring in the file name.
    csv_output_path = output_dir / Path(args.hypothesis_path).with_suffix('.csv').name
    with open(csv_output_path, 'w', encoding="utf-8") as csv_file:
        csv_file.write('SIN,Event or Relation type,time,arg1,arg2,arg3,arg4,arg5,comments,ID\n')
        prev = None
        for sin_key, row in output_list:
            if prev is not None:
                # One blank line for each of the first four key components
                # that changed since the previous row.
                num_changed = sum(
                    1 for pos in range(4) if prev[pos] != sin_key[pos])
                csv_file.write('\n' * num_changed)
            csv_file.write(row + '\n')
            prev = sin_key
Пример #13
0
def main():
    """Build and write one AIF subgraph file per top hypothesis.

    Command-line arguments: the graph JSON path, a hypotheses file or a
    directory of ``.json`` files, the TA2 KB file, the output directory, the
    run ID, the SIN ID prefix, ``--top`` (default 50) and ``-f/--force``.

    The KB is parsed once and indexed by node category. For every hypotheses
    file, hypotheses are visited in descending order of
    ``hypotheses_json['probs']`` and a subgraph is built for each with
    ``build_subgraph_for_hypothesis``. Output files are named
    ``<run_id>.<soin_id>.<frame_id>.HNNN.ttl`` with a one-based rank. The
    hypothesis weight is ``exp(prob / 2)`` when ``prob <= 0`` and ``0.0001``
    otherwise.
    """
    arg_parser = ArgumentParser()
    arg_parser.add_argument('graph_path', help='path to the graph json file')
    arg_parser.add_argument(
        'hypotheses_path', help='path to the hypotheses json directory')
    arg_parser.add_argument(
        'kb_path', help='path to the TA2 KB file (in AIF)')
    arg_parser.add_argument('output_dir', help='path to output directory')
    arg_parser.add_argument('run_id', help='TA3 run ID')
    arg_parser.add_argument(
        'sin_id_prefix',
        help='prefix of SIN IDs to name the final hypotheses')
    arg_parser.add_argument(
        '--top', default=50, type=int,
        help='number of top hypothesis to output')
    arg_parser.add_argument(
        '-f', '--force', action='store_true', default=False,
        help='If specified, overwrite existing output files without warning')
    args = arg_parser.parse_args()

    graph_dict = util.read_json_file(args.graph_path, 'JSON graph')
    json_graph = JsonGraph.from_dict(graph_dict)
    graph_mappings = json_graph.build_cluster_member_mappings()

    hypotheses_file_paths = util.get_file_list(
        args.hypotheses_path, suffix='.json', sort=True)

    # TODO: there is a known bug in rdflib that
    #  rdflib.Literal("2008", datatype=rdflib.XSD.gYear) would be parsed into
    #  rdflib.term.Literal(u'2008-01-01', datatype=rdflib.XSD.gYear)
    #  automatically, because a `parse_date` function is invoked for all
    #  rdflib.XSD.gYear literals. This is a temporary workaround to patch the
    #  _toPythonMapping locally.
    #  c.f.: https://github.com/RDFLib/rdflib/issues/806
    # noinspection PyProtectedMember
    rdflib.term._toPythonMapping.pop(rdflib.XSD['gYear'])

    print('Reading kb from {}'.format(args.kb_path))
    kb_graph = Graph()
    kb_graph.parse(args.kb_path, format='ttl')

    # Index the KB once; the indexes are shared by all hypotheses.
    kb_nodes_by_category = catalogue_kb_nodes(kb_graph)
    kb_stmt_key_mapping = index_statement_nodes(
        kb_graph, kb_nodes_by_category['Statement'])
    kb_cm_key_mapping = index_cluster_membership_nodes(
        kb_graph, kb_nodes_by_category['ClusterMembership'])
    kb_type_stmt_key_mapping = index_type_statement_nodes(
        kb_graph, kb_nodes_by_category['TypeStatement'])

    output_dir = util.get_output_dir(
        args.output_dir, overwrite_warning=not args.force)

    run_id = args.run_id
    sin_id_prefix = args.sin_id_prefix

    for hypotheses_file_path in hypotheses_file_paths:
        hypotheses_json = util.read_json_file(
            hypotheses_file_path, 'hypotheses')
        probs = hypotheses_json['probs']

        print('Found {} hypotheses with probability {}'.format(
            len(probs), hypotheses_json['probs']))

        # The SIN name comes from the file stem up to its first underscore.
        file_prefix = hypotheses_file_path.stem.split('_')[0]
        soin_id = sin_id_prefix + '_' + file_prefix
        frame_id = soin_id + '_F1'

        # (index, prob) pairs, most probable first.
        ranked_results = sorted(
            enumerate(hypotheses_json['probs']),
            key=itemgetter(1),
            reverse=True)

        for top_count, (hypothesis_idx, prob) in enumerate(
                ranked_results, start=1):
            hypothesis_weight = (
                math.exp(prob / 2.0) if prob <= 0.0 else 0.0001)

            hypothesis = hypotheses_json['support'][hypothesis_idx]
            hypothesis_id = '{}_hypothesis_{:0>3d}'.format(frame_id, top_count)

            subgraph = build_subgraph_for_hypothesis(
                kb_graph=kb_graph,
                kb_nodes_by_category=kb_nodes_by_category,
                kb_stmt_key_mapping=kb_stmt_key_mapping,
                kb_cm_key_mapping=kb_cm_key_mapping,
                kb_type_stmt_key_mapping=kb_type_stmt_key_mapping,
                json_graph=json_graph,
                graph_mappings=graph_mappings,
                hypothesis=hypothesis,
                hypothesis_id=hypothesis_id,
                hypothesis_weight=hypothesis_weight)

            file_name = '{}.{}.{}.H{:0>3d}.ttl'.format(
                run_id, soin_id, frame_id, top_count)
            output_path = output_dir / file_name
            print('Writing hypothesis #{:>2d} with prob {:>6.2f} to {}'.format(
                top_count, prob, output_path))
            with open(output_path, 'w') as fout:
                fout.write(print_graph(subgraph))

            if top_count >= args.top:
                break