Example #1
def main():
    parser = ArgumentParser()
    parser.add_argument('input_graph_path',
                        help='path to the input graph json file')
    parser.add_argument('output_graph_path',
                        help='path to write the coref-compressed graph')
    parser.add_argument('output_log_path', help='path to write the log file')
    parser.add_argument(
        '-f',
        '--force',
        action='store_true',
        default=False,
        help='If specified, overwrite existing output files without warning')

    args = parser.parse_args()

    output_graph_path = util.get_output_path(args.output_graph_path,
                                             overwrite_warning=not args.force)
    output_log_path = util.get_output_path(args.output_log_path,
                                           overwrite_warning=not args.force)

    input_json_graph = JsonGraph.from_dict(
        util.read_json_file(args.input_graph_path, 'JSON graph'))

    num_old_eres = len(list(input_json_graph.each_ere()))
    assert num_old_eres == len(input_json_graph.eres)
    num_old_stmts = len(list(input_json_graph.each_statement()))
    logging.info(
        'Found {} EREs and {} statements in the original graph'.format(
            num_old_eres, num_old_stmts))

    mappings = build_mappings(input_json_graph)

    output_json_graph = JsonGraph()

    num_new_eres = compress_eres(input_json_graph, mappings, output_json_graph)
    num_new_stmts = compress_statements(input_json_graph, mappings,
                                        output_json_graph)

    logging.info(
        'Finished coref-compressed graph with {} EREs and {} statements'.
        format(num_new_eres, num_new_stmts))

    logging.info(
        'Writing compressed json graph to {}'.format(output_graph_path))
    with open(str(output_graph_path), 'w') as fout:
        json.dump(output_json_graph.as_dict(), fout, indent=1)

    # Dump the mappings to the log file, skipping the mappings that involve
    # statement / membership key tuples; the set values of the plural
    # mappings are converted to lists for JSON serialization
    log_json = {}
    for mapping_key, mapping in mappings.items():
        if 'key' in mapping_key:
            continue
        if mapping_key.endswith('s'):
            log_json[mapping_key] = {k: list(v) for k, v in mapping.items()}
        else:
            log_json[mapping_key] = mapping

    logging.info('Writing compression log to {}'.format(output_log_path))
    with open(str(output_log_path), 'w') as fout:
        json.dump(log_json, fout, indent=2)
Example #2
def main():
    parser = ArgumentParser()
    # required positional
    parser.add_argument('graph_path', help='path to the graph JSON file')
    parser.add_argument('hypothesis_path',
                        help='path to the JSON file with hypotheses')

    args = parser.parse_args()

    # read KB
    json_graph = JsonGraph.from_dict(
        util.read_json_file(args.graph_path, 'JSON graph'))

    # read hypotheses
    hypotheses_json = util.read_json_file(args.hypothesis_path, 'hypotheses')
    hypothesis_collection = AidaHypothesisCollection.from_json(
        hypotheses_json, json_graph)

    analysis_obj = defaultdict(list)

    for hyp in hypothesis_collection:
        analysis_obj = hyp_stats(hyp, analysis_obj, json_graph)

    # for idx in range(len(analysis_obj["stmts"])):
    #     print("-----------Hypothesis", idx, "-------")
    #     for key, val in analysis_obj.items():
    #         print(key, ":", val[idx])

    print("================ Overall =============")
    for key, val in analysis_obj.items():
        print(key, round(sum(val) / len(val), 2))
Example #3
def build_mappings(json_graph: JsonGraph):
    # Build mappings among clusters, members, and prototypes
    mappings = json_graph.build_cluster_member_mappings()

    # Build mappings from old statement labels to new statement labels
    stmt_count = 0

    stmt_key_to_new_stmt = {}
    new_stmt_to_stmt_key = {}
    old_stmt_to_new_stmts = defaultdict(set)
    new_stmt_to_old_stmts = defaultdict(set)

    for node_label, node in json_graph.node_dict.items():
        if node.type == 'Statement':
            stmt_keys = make_stmt_keys(
                stmt_entry=node,
                member_to_prototypes=mappings['member_to_prototypes'])
            for stmt_key in stmt_keys:
                if stmt_key not in stmt_key_to_new_stmt:
                    new_stmt_label = 'Statement-{}'.format(stmt_count)
                    stmt_count += 1
                    stmt_key_to_new_stmt[stmt_key] = new_stmt_label
                    new_stmt_to_stmt_key[new_stmt_label] = stmt_key
                else:
                    new_stmt_label = stmt_key_to_new_stmt[stmt_key]

                old_stmt_to_new_stmts[node_label].add(new_stmt_label)
                new_stmt_to_old_stmts[new_stmt_label].add(node_label)

    num_old_stmts = len(old_stmt_to_new_stmts)
    num_new_stmts = len(new_stmt_to_old_stmts)

    assert len(stmt_key_to_new_stmt) == num_new_stmts
    assert len(new_stmt_to_stmt_key) == num_new_stmts

    print('\nConstructed mapping from {} old statements to {} new statements'.
          format(num_old_stmts, num_new_stmts))

    new_stmts_per_old_stmt_counter = Counter(
        [len(v) for v in old_stmt_to_new_stmts.values()])
    for key in sorted(new_stmts_per_old_stmt_counter.keys()):
        if key > 1:
            print(
                '\tFor {} out of {} old statements, each is mapped to {} new statements'
                .format(new_stmts_per_old_stmt_counter[key], num_old_stmts,
                        key))

    mappings.update({
        'stmt_key_to_new_stmt': stmt_key_to_new_stmt,
        'new_stmt_to_stmt_key': new_stmt_to_stmt_key,
        'old_stmt_to_new_stmts': old_stmt_to_new_stmts,
        'new_stmt_to_old_stmts': new_stmt_to_old_stmts
    })

    return mappings
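
Note: make_stmt_keys is not shown in this listing. Below is a minimal sketch of
what it might look like, inferred only from how it is called above; it assumes
that member_to_prototypes maps each ERE label to a set of prototype labels and
that endpoints missing from the mapping (e.g. type strings) pass through
unchanged.

def make_stmt_keys(stmt_entry, member_to_prototypes):
    """Hypothetical sketch: build (subject, predicate, object) keys for a
    statement, with ERE endpoints replaced by their cluster prototypes."""
    subjects = member_to_prototypes.get(stmt_entry.subject, {stmt_entry.subject})
    objects = member_to_prototypes.get(stmt_entry.object, {stmt_entry.object})
    return [(subj, stmt_entry.predicate, obj)
            for subj in subjects for obj in objects]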
Example #4
def compress_statements(input_json_graph: JsonGraph, mappings: Dict,
                        output_json_graph: JsonGraph):
    logging.info('Building statement entries for the compressed graph ...')

    assert len(output_json_graph.statements) == 0

    num_new_stmts = 0

    for new_stmt, stmt_key in mappings['new_stmt_to_stmt_key'].items():
        stmt_idx = int(new_stmt.split('-')[1])
        subj, pred, obj = stmt_key
        new_entry = {
            'type': 'Statement',
            'index': stmt_idx,
            'subject': subj,
            'predicate': pred,
            'object': obj
        }

        conf_levels = set()
        for old_stmt in mappings['new_stmt_to_old_stmts'][new_stmt]:
            old_stmt_entry = input_json_graph.node_dict[old_stmt]
            if old_stmt_entry.conf is not None:
                conf_levels.add(old_stmt_entry.conf)

        new_entry['conf'] = max(conf_levels) if conf_levels else None

        # old_stmt_entry_list = [input_json_graph.node_dict[old_stmt]
        #                        for old_stmt in mappings['new_stmt_to_old_stmts'][new_stmt]]
        #
        # # Resolve the extra information (source and hypotheses) of the new
        # # statement
        # for label in ['source', 'hypotheses_supported', 'hypotheses_partially_supported',
        #               'hypotheses_contradicted']:
        #     label_value_set = set()
        #     for old_stmt_entry in old_stmt_entry_list:
        #         if label in old_stmt_entry:
        #             label_value_set.update(old_stmt_entry[label])
        #     if len(label_value_set) > 0:
        #         new_entry[label] = list(label_value_set)

        output_json_graph.node_dict[new_stmt] = StatementNode(**new_entry)
        output_json_graph.statements.append(new_stmt)
        num_new_stmts += 1

    return num_new_stmts
Example #5
def main():
    parser = ArgumentParser()

    parser.add_argument('graph_path', help='Path to the input graph JSON file')
    parser.add_argument('hypotheses_path',
                        help='Path to the raw hypotheses file, or a directory with multiple files')
    parser.add_argument('output_dir',
                        help='Directory to write the filtered hypothesis file(s)')
    parser.add_argument('-f', '--force', action='store_true', default=False,
                        help='If specified, overwrite existing output files without warning')

    args = parser.parse_args()

    output_dir = util.get_output_dir(args.output_dir, overwrite_warning=not args.force)

    json_graph = JsonGraph.from_dict(util.read_json_file(args.graph_path, 'JSON graph'))
    hypotheses_file_paths = util.get_file_list(args.hypotheses_path, suffix='.json', sort=True)

    for hypotheses_file_path in hypotheses_file_paths:
        hypotheses_json = util.read_json_file(hypotheses_file_path, 'hypotheses')
        hypothesis_collection = AidaHypothesisCollection.from_json(hypotheses_json, json_graph)

        hypothesis_collection.expand()

        # create the filter
        hypothesis_filter = AidaHypothesisFilter(json_graph)

        filtered_hyplist = [hypothesis_filter.filtered(hypothesis)
                            for hypothesis in hypothesis_collection
                            if not hypothesis_too_short(hypothesis, json_graph)]

        filtered_hypothesis_collection = AidaHypothesisCollection(compactify(filtered_hyplist, json_graph))

        filtered_hypotheses_json = filtered_hypothesis_collection.to_json()

        # add graph filename and queries, if they were there before
        if 'graph' in hypotheses_json:
            filtered_hypotheses_json['graph'] = hypotheses_json['graph']
        if "queries" in hypotheses_json:
            filtered_hypotheses_json['queries'] = hypotheses_json['queries']

        output_path = output_dir / hypotheses_file_path.name
        logging.info('Writing filtered hypotheses to {} ...'.format(output_path))

        with open(str(output_path), 'w') as fout:
            json.dump(filtered_hypotheses_json, fout, indent=1)
Example #6
File: stats.py Project: pxch/aida-utexas
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('input_path', help='Path to a TA2 KB or a JSON graph')

    args = parser.parse_args()

    input_path = util.get_input_path(args.input_path)

    if input_path.suffix == '.ttl':
        aida_graph = AidaGraph()
        aida_graph.build_graph(str(input_path), fmt='ttl')

        get_kb_stats(aida_graph)

    elif input_path.suffix == '.json':
        with open(input_path, 'r') as fin:
            json_graph = JsonGraph.from_dict(json.load(fin))

        get_json_stats(json_graph)
Example #7
def main():
    parser = ArgumentParser()
    parser.add_argument('graph_path', help='path to the graph JSON file')
    parser.add_argument('hypothesis_path', help='path to the JSON file with hypotheses')
    parser.add_argument('roles_ontology_path', help='path to the roles ontology file')
    parser.add_argument('output_dir', help='directory to write human-readable hypotheses')

    args = parser.parse_args()

    json_graph = JsonGraph.from_dict(util.read_json_file(args.graph_path, 'JSON graph'))

    hypotheses_json = util.read_json_file(args.hypothesis_path, 'hypotheses')
    hypothesis_collection = AidaHypothesisCollection.from_json(hypotheses_json, json_graph)

    roles_ontology = util.read_json_file(args.roles_ontology_path, 'roles ontology')

    output_dir = util.get_output_dir(args.output_dir, overwrite_warning=True)

    for idx, hypothesis in enumerate(hypothesis_collection.hypotheses):
        output_path = output_dir / 'hypothesis-{:0>3d}.txt'.format(idx)
        with open(str(output_path), "w", encoding="utf-8") as fout:
            print(hypothesis.to_str(roles_ontology), file=fout)
Example #8
def compress_eres(input_json_graph: JsonGraph, mappings: Dict,
                  output_json_graph: JsonGraph):
    assert len(output_json_graph.eres) == 0

    logging.info(
        'Building ERE / SameAsCluster / ClusterMembership entries for the compressed '
        'graph ...')

    num_new_eres = 0

    for prototype, members in mappings['prototype_to_members'].items():
        old_entry = input_json_graph.node_dict[prototype]

        # Use the same ERE index from the original graph
        new_entry = {'index': old_entry.index}

        member_entry_list = [
            input_json_graph.node_dict[member] for member in members
        ]

        # Resolve the type of the compressed ERE node
        type_set = set(member_entry.type for member_entry in member_entry_list)
        # if len(type_set) > 1:
        #     type_set.remove('Entity')
        if len(type_set) > 1:
            logging.warning(
                'Multiple types {} found among the following EREs: {}'.format(
                    type_set, members))
        new_entry['type'] = type_set.pop()

        # Resolve the adjacent statements of the compressed ERE node
        adjacency_set = set()
        for member_entry in member_entry_list:
            for old_stmt in member_entry.adjacent:
                adjacency_set.update(
                    mappings['old_stmt_to_new_stmts'][old_stmt])
        new_entry['adjacent'] = list(adjacency_set)

        # Resolve the names of the compressed ERE node
        name_set = set()
        for member_entry in member_entry_list:
            name_set.update(member_entry.name)
        for cluster in mappings['prototype_to_clusters'][prototype]:
            cluster_handle = input_json_graph.node_dict[cluster].handle
            if cluster_handle is not None and cluster_handle != '[unknown]':
                name_set.add(cluster_handle)
        new_entry['name'] = list(name_set)

        # Resolve the LDC time list of the compressed ERE node
        ldc_time_list = []
        for member_entry in member_entry_list:
            ldc_time_list.extend(member_entry.ldcTime)
        new_entry['ldcTime'] = ldc_time_list

        output_json_graph.node_dict[prototype] = ERENode(**new_entry)
        output_json_graph.eres.append(prototype)

        # Add SameAsCluster nodes and ClusterMembership nodes
        for cluster in mappings['prototype_to_clusters'][prototype]:
            output_json_graph.node_dict[cluster] = deepcopy(
                input_json_graph.node_dict[cluster])

            for cluster_membership_key in \
                    mappings['cluster_membership_key_mapping'][(cluster, prototype)]:
                output_json_graph.node_dict[cluster_membership_key] = deepcopy(
                    input_json_graph.node_dict[cluster_membership_key])

        num_new_eres += 1

    return num_new_eres
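
For orientation, here is an illustrative (made-up) shape of the mappings
entries consumed by compress_eres above. The real contents come from
build_cluster_member_mappings and build_mappings, and the labels are
graph-specific IRIs.

example_mappings = {
    # hypothetical labels, for illustration only
    'prototype_to_members': {'proto-1': {'ere-1', 'ere-2'}},
    'prototype_to_clusters': {'proto-1': {'cluster-1'}},
    'cluster_membership_key_mapping': {
        ('cluster-1', 'proto-1'): {'cluster-membership-1'},
    },
    'old_stmt_to_new_stmts': {'assertion-1': {'Statement-0'}},
}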
Example #9
def main():
    parser = ArgumentParser()
    parser.add_argument('graph_path', help='path to the graph json file')
    parser.add_argument('hypotheses_path', help='path to the hypotheses json file')
    parser.add_argument('db_dir', help='directory with copies of tdb databases')
    parser.add_argument('output_dir', help='path to output directory')
    parser.add_argument('--top', default=50, type=int,
                        help='number of top hypotheses to output')
    parser.add_argument('--dry_run', action='store_true',
                        help='if specified, only write the SPARQL queries to '
                             'files, without actually executing the queries')
    parser.add_argument('--query_just', action='store_true')
    parser.add_argument('--query_conf', action='store_true')
    parser.add_argument('-f', '--force', action='store_true', default=False,
                        help='If specified, overwrite existing output files without warning')

    args = parser.parse_args()

    json_graph = JsonGraph.from_dict(util.read_json_file(args.graph_path, 'JSON graph'))

    mappings = json_graph.build_cluster_member_mappings()
    member_to_clusters = mappings['member_to_clusters']
    cluster_to_prototype = mappings['cluster_to_prototype']
    prototype_set = set(mappings['prototype_to_clusters'].keys())

    hypotheses_json = util.read_json_file(args.hypotheses_path, 'hypotheses')

    output_dir = util.get_output_dir(args.output_dir, overwrite_warning=not args.force)

    db_dir = util.get_input_path(args.db_dir)
    db_path_list = [str(path) for path in sorted(db_dir.glob('copy*'))]
    print('Using the following tdb databases to query: {}'.format(db_path_list))

    num_node_queries = len(db_path_list)

    top_count = 0
    for result_idx, prob in sorted(
            enumerate(hypotheses_json['probs']), key=itemgetter(1), reverse=True):
        hypothesis = hypotheses_json['support'][result_idx]
        # node_query_list, stmt_query_list, just_query_list, conf_query_list = \
        sparql_query_str = \
            queries_for_aida_result(
                json_graph=json_graph,
                hypothesis=hypothesis,
                member_to_clusters=member_to_clusters,
                cluster_to_prototype=cluster_to_prototype,
                prototype_set=prototype_set,
                num_node_queries=num_node_queries,
                query_just=args.query_just,
                query_conf=args.query_conf)

        top_count += 1

        print(f'Writing queries for hypothesis #{top_count} with prob {prob}')

        sparql_query_path = output_dir / 'hypothesis-{:0>3d}-query.rq'.format(top_count)
        with open(str(sparql_query_path), 'w') as fout:
            fout.write(sparql_query_str + '\n')

        if not args.dry_run:
            query_result_path = output_dir / 'hypothesis-{:0>3d}-raw.ttl'.format(top_count)
            query_cmd = 'echo "query {0}"; tdbquery --loc {1} --query {0} > {2}; '.format(
                sparql_query_path, db_path_list[0], query_result_path)

            print('Executing queries ...')
            process = subprocess.Popen(query_cmd, shell=True)
            process.wait()

        # sparql_helper.execute_sparql_queries(
        #     node_query_list, stmt_query_list, just_query_list, conf_query_list,
        #     db_path_list, output_dir,
        #     filename_prefix='hypothesis-{:0>3d}'.format(top_count),
        #     header_prefixes=AIF_HEADER_PREFIXES, dry_run=args.dry_run)

        if top_count >= args.top:
            break
Example #10
def main():
    parser = ArgumentParser()
    parser.add_argument('graph_path', help='path to the graph json file')
    parser.add_argument('hypotheses_path',
                        help='path to the hypotheses json directory')
    parser.add_argument('kb_path', help='path to the TA2 KB file (in AIF)')
    parser.add_argument('output_dir', help='path to output directory')
    parser.add_argument('run_id', help='TA3 run ID')
    parser.add_argument('sin_id_prefix',
                        help='prefix of SIN IDs to name the final hypotheses')
    parser.add_argument('--top',
                        default=50,
                        type=int,
                        help='number of top hypotheses to output')
    parser.add_argument(
        '-f',
        '--force',
        action='store_true',
        default=False,
        help='If specified, overwrite existing output files without warning')

    args = parser.parse_args()

    json_graph = JsonGraph.from_dict(
        util.read_json_file(args.graph_path, 'JSON graph'))

    graph_mappings = json_graph.build_cluster_member_mappings()

    hypotheses_file_paths = util.get_file_list(args.hypotheses_path,
                                               suffix='.json',
                                               sort=True)

    # TODO: there is a known bug in rdflib that
    #  rdflib.Literal("2008", datatype=rdflib.XSD.gYear) would be parsed into
    #  rdflib.term.Literal(u'2008-01-01', datatype=rdflib.XSD.gYear) automatically,
    #  because a `parse_date` function is invoked for all rdflib.XSD.gYear literals.
    #  This is a temporary workaround to patch the _toPythonMapping locally.
    #  c.f.: https://github.com/RDFLib/rdflib/issues/806
    # noinspection PyProtectedMember
    rdflib.term._toPythonMapping.pop(rdflib.XSD['gYear'])

    print('Reading kb from {}'.format(args.kb_path))
    kb_graph = Graph()
    kb_graph.parse(args.kb_path, format='ttl')

    kb_nodes_by_category = catalogue_kb_nodes(kb_graph)

    kb_stmt_key_mapping = index_statement_nodes(
        kb_graph, kb_nodes_by_category['Statement'])
    kb_cm_key_mapping = index_cluster_membership_nodes(
        kb_graph, kb_nodes_by_category['ClusterMembership'])
    kb_type_stmt_key_mapping = index_type_statement_nodes(
        kb_graph, kb_nodes_by_category['TypeStatement'])

    output_dir = util.get_output_dir(args.output_dir,
                                     overwrite_warning=not args.force)

    run_id = args.run_id
    sin_id_prefix = args.sin_id_prefix

    for hypotheses_file_path in hypotheses_file_paths:
        hypotheses_json = util.read_json_file(hypotheses_file_path,
                                              'hypotheses')

        print('Found {} hypotheses with probabilities {}'.format(
            len(hypotheses_json['probs']), hypotheses_json['probs']))

        soin_id = sin_id_prefix + '_' + hypotheses_file_path.stem.split('_')[0]
        frame_id = soin_id + '_F1'

        top_count = 0
        for hypothesis_idx, prob in sorted(enumerate(hypotheses_json['probs']),
                                           key=itemgetter(1),
                                           reverse=True):
            if prob <= 0.0:
                hypothesis_weight = math.exp(prob / 2.0)
            else:
                hypothesis_weight = 0.0001

            hypothesis = hypotheses_json['support'][hypothesis_idx]

            top_count += 1
            hypothesis_id = '{}_hypothesis_{:0>3d}'.format(frame_id, top_count)

            subgraph = build_subgraph_for_hypothesis(
                kb_graph=kb_graph,
                kb_nodes_by_category=kb_nodes_by_category,
                kb_stmt_key_mapping=kb_stmt_key_mapping,
                kb_cm_key_mapping=kb_cm_key_mapping,
                kb_type_stmt_key_mapping=kb_type_stmt_key_mapping,
                json_graph=json_graph,
                graph_mappings=graph_mappings,
                hypothesis=hypothesis,
                hypothesis_id=hypothesis_id,
                hypothesis_weight=hypothesis_weight)

            output_path = output_dir / '{}.{}.{}.H{:0>3d}.ttl'.format(
                run_id, soin_id, frame_id, top_count)
            print('Writing hypothesis #{:>2d} with prob {:>6.2f} to {}'.format(
                top_count, prob, output_path))
            with open(output_path, 'w') as fout:
                fout.write(print_graph(subgraph))

            if top_count >= args.top:
                break
Example #11
File: stats.py Project: pxch/aida-utexas
def get_json_stats(json_graph: JsonGraph):
    eres = []
    singleton_eres = []
    entities = []
    singleton_entities = []
    relations = []
    singleton_relations = []
    events = []
    singleton_events = []
    stmts = []
    type_stmts = []
    clusters = []
    cluster_memberships = []
    prototypes = []
    ere_to_memberships = defaultdict(set)
    ere_to_clusters = defaultdict(set)

    for node_label, node in json_graph.node_dict.items():
        if json_graph.is_ere(node_label):
            eres.append(node_label)

            is_singleton = True
            for stmt_label in json_graph.each_ere_adjacent_stmt(node_label):
                if not json_graph.is_type_stmt(stmt_label):
                    is_singleton = False
                    break
            if is_singleton:
                singleton_eres.append(node_label)

            if json_graph.is_entity(node_label):
                entities.append(node_label)
                if is_singleton:
                    singleton_entities.append(node_label)
            if json_graph.is_relation(node_label):
                relations.append(node_label)
                if is_singleton:
                    singleton_relations.append(node_label)
            if json_graph.is_event(node_label):
                events.append(node_label)
                if is_singleton:
                    singleton_events.append(node_label)

        if json_graph.is_statement(node_label):
            stmts.append(node_label)
        if json_graph.is_type_stmt(node_label):
            type_stmts.append(node_label)

        if node.type == 'SameAsCluster':
            clusters.append(node_label)
            prototypes.append(node.prototype)
            ere_to_clusters[node.prototype].add(node_label)

        if node.type == 'ClusterMembership':
            cluster_memberships.append(node_label)
            clusters.append(node.cluster)
            ere_to_clusters[node.clusterMember].add(node.cluster)
            ere_to_memberships[node.clusterMember].add(node_label)

    print(f'# Nodes: {len(json_graph.node_dict)}')
    print(f'# EREs: {len(eres)} ({len(singleton_eres)} are singleton)')
    print(f'# Entities: {len(entities)} ({len(singleton_entities)} are singleton)')
    print(f'# Relations: {len(relations)} ({len(singleton_relations)} are singleton)')
    print(f'# Events: {len(events)} ({len(singleton_events)} are singleton)')
    print(f'# Statements: {len(stmts)}')
    print(f'# Type Statements: {len(type_stmts)}')
    print(f'# SameAsClusters: {len(clusters)}')
    print(f'# ClusterMemberships: {len(cluster_memberships)}')
    print(f'# Prototype EREs: {len(prototypes)}')

    num_clusters_per_ere = [len(val) for val in ere_to_clusters.values()]
    print(f'# Clusters per ERE: min = {min(num_clusters_per_ere)}, '
          f'max = {max(num_clusters_per_ere)}, '
          f'mean = {sum(num_clusters_per_ere) / len(num_clusters_per_ere)}')

    num_memberships_per_ere = [len(val) for val in ere_to_memberships.values()]
    print(f'# Memberships per ERE: min = {min(num_memberships_per_ere)}, '
          f'max = {max(num_memberships_per_ere)}, '
          f'mean = {sum(num_memberships_per_ere) / len(num_memberships_per_ere)}')
Example #12
def main():
    parser = ArgumentParser()
    parser.add_argument('graph_path', help='path to the graph json file')
    parser.add_argument('hypotheses_path',
                        help='path to the hypotheses json file')
    parser.add_argument('output_dir', help='Directory to write queries')
    parser.add_argument('frame_id', help='Frame ID of the hypotheses')
    parser.add_argument('--top',
                        default=50,
                        type=int,
                        help='number of top hypotheses to output')
    parser.add_argument(
        '-f',
        '--force',
        action='store_true',
        default=False,
        help='If specified, overwrite existing output files without warning')

    args = parser.parse_args()

    json_graph = JsonGraph.from_dict(
        util.read_json_file(args.graph_path, 'JSON graph'))
    mappings = json_graph.build_cluster_member_mappings()

    hypotheses_json = util.read_json_file(args.hypotheses_path, 'hypotheses')

    output_dir = util.get_output_dir(args.output_dir,
                                     overwrite_warning=not args.force)
    frame_id = args.frame_id

    top_count = 0

    for result_idx, prob in sorted(enumerate(hypotheses_json['probs']),
                                   key=itemgetter(1),
                                   reverse=True):
        if prob <= 0.0:
            hyp_weight = math.exp(prob / 2.0)
        else:
            hyp_weight = 0.0001

        hypothesis = hypotheses_json['support'][result_idx]

        top_count += 1

        hypothesis_id = '{}_hypothesis_{:0>3d}'.format(frame_id, top_count)

        hypothesis_name = 'utexas:{}'.format(hypothesis_id)
        subgraph_name = hypothesis_name + '_subgraph'

        update_query_count = 0

        # Build an update query to add aida:Hypothesis and its importance values, as well as
        # the importance values for all event and relation clusters.
        update_str = update_prefix + 'INSERT DATA\n{\n'
        update_str += '  {} a aida:Hypothesis .\n'.format(hypothesis_name)
        update_str += '  {} aida:importance "{:.4f}"^^xsd:double .\n'.format(
            hypothesis_name, hyp_weight)
        update_str += '  {} aida:hypothesisContent {} .\n'.format(
            hypothesis_name, subgraph_name)
        update_str += '  {} a aida:Subgraph .\n'.format(subgraph_name)

        stmt_importance, node_importance = compute_importance_mapping(
            json_graph,
            hypothesis,
            member_to_clusters=mappings['member_to_clusters'],
            cluster_to_prototype=mappings['cluster_to_prototype'])

        for node_id, importance_value in node_importance.items():
            update_str += '  <{}> aida:importance "{:.4f}"^^xsd:double .\n'.format(
                node_id, importance_value)

        update_str += '}'

        output_path = output_dir / 'hypothesis-{:0>3d}-update-{:0>4d}.rq'.format(
            top_count, update_query_count)

        with open(str(output_path), 'w') as fout:
            fout.write(update_str)

        update_query_count += 1

        # Build an update query for the aida:subgraphContains field of the aida:Subgraph node as
        # the aida:hypothesisContent. We just include all ERE nodes for simplicity, as it's not
        # required that all KEs should be included for NIST to evaluate in M18.
        update_str = update_prefix
        update_str += \
            'INSERT {{\n' \
            '{} aida:subgraphContains ?e .\n' \
            '}}\nWHERE\n{{\n' \
            '{{ ?e a aida:Entity }}\nUNION\n' \
            '{{ ?e a aida:Relation }}\nUNION\n' \
            '{{ ?e a aida:Event }}\n}}\n'.format(subgraph_name)

        output_path = output_dir / 'hypothesis-{:0>3d}-update-{:0>4d}.rq'.format(
            top_count, update_query_count)
        with open(str(output_path), 'w') as fout:
            fout.write(update_str)

        update_query_count += 1

        # Build an update query for the importance value of each statement. We would need
        # a separate query for each statement, because we need to use the INSERT {} WHERE {}
        # operator here to allow BNode statements.
        for (stmt_subj, stmt_pred,
             stmt_obj), importance_value in stmt_importance.items():
            update_str = update_prefix
            update_str += \
                'INSERT {{ ?x aida:importance "{:.4f}"^^xsd:double . }}\n' \
                'WHERE\n{{\n' \
                '?x a rdf:Statement .\n' \
                '?x rdf:subject <{}> .\n' \
                '?x rdf:predicate ldcOnt:{} .\n' \
                '?x rdf:object <{}> .\n}}\n'.format(
                    importance_value, stmt_subj, stmt_pred, stmt_obj)

            output_path = output_dir / 'hypothesis-{:0>3d}-update-{:0>4d}.rq'.format(
                top_count, update_query_count)

            with open(str(output_path), 'w') as fout:
                fout.write(update_str)

            update_query_count += 1

        if top_count >= args.top:
            break
Example #13
def main():
    parser = ArgumentParser()
    # required positional
    parser.add_argument('graph_path', help='path to the graph JSON file')
    parser.add_argument('hypothesis_path',
                        help='path to the JSON file with hypotheses')
    parser.add_argument("roles_ontology_path", help="path to roles ontology")

    args = parser.parse_args()

    print("Reading in data...")

    # read KB
    json_graph = JsonGraph.from_dict(
        util.read_json_file(args.graph_path, 'JSON graph'))

    # read hypotheses
    hypotheses_json = util.read_json_file(args.hypothesis_path, 'hypotheses')
    hypothesis_collection = AidaHypothesisCollection.from_json(
        hypotheses_json, json_graph)

    # read roles ontology
    roles_ontology = util.read_json_file(args.roles_ontology_path,
                                         'roles ontology')

    # determine all question IDs
    questionIDs = set()
    for h in hypothesis_collection:
        questionIDs.update(h.questionIDs)

    choice = question_id = restrict_core_role = restrict_core_ere = None
    # Start from the full collection so that choices that need it (e.g. "sr")
    # work even before a question ID has been selected
    this_hypothesis_collection = hypothesis_collection
    while choice != "x":
        # determine core choice
        print("question IDs:", ", ".join(questionIDs))
        print("Choose from:")
        print("c: core hypothesis display")
        print("e: show events/relations connected to an ere")
        print("r: show events/relations connected to a role filler")
        print("se: survey context of an ERE independent of hypotheses")
        print("sr: survey context of a role filler independent of hypotheses")
        print("p: print hypotheses for a particular question ID")
        print(
            "R: restrict hypotheses to be considered going forward, for the rest of the run"
        )
        print("x: exit")

        choice = input()

        # determine additional restrictions on hypotheses to consider
        if choice in ["c", "e", "r", "p"]:
            question_id = input("Question ID: ")

            # filter hypotheses by question ID
            this_hypothesis_collection = filter_hypotheses_by_question(
                hypothesis_collection, question_id)

            # additionally filter by a core role filler?
            restrict_core_role = input("Optional core role to restrict: ")
            if restrict_core_role != "":
                restrict_core_ere = input(
                    "Value to restrict the core role to (ERE ID): ")

                this_hypothesis_collection = filter_hypotheses_by_entrypoints(
                    this_hypothesis_collection, json_graph, restrict_core_role,
                    restrict_core_ere)

        # execute choice
        if choice == "c":
            show_core(json_graph, this_hypothesis_collection)
        elif choice == "e":
            show_ere(json_graph, this_hypothesis_collection, roles_ontology)
        elif choice == "r":
            show_rolefiller(json_graph, this_hypothesis_collection,
                            roles_ontology)
        elif choice == "se":
            show_ere_graphenv(json_graph, roles_ontology)
        elif choice == "sr":
            show_role_graphenv(json_graph, this_hypothesis_collection,
                               roles_ontology)
        elif choice == "R":
            restrict_core_role = input("Core role to restrict: ")
            restrict_core_ere = input(
                "Value to restrict the core role to (ERE ID): ")

            hypothesis_collection = filter_hypotheses_by_entrypoints(
                hypothesis_collection, json_graph, restrict_core_role,
                restrict_core_ere)
        elif choice == "p":
            print_hypotheses(json_graph, hypothesis_collection, roles_ontology)
Example #14
def main():
    parser = ArgumentParser()
    parser.add_argument('graph_path', help='path to the graph JSON file')
    parser.add_argument('hypothesis_path', help='path to the JSON file with hypotheses')
    parser.add_argument('roles_ontology_path', help='path to the roles ontology file')
    parser.add_argument('output_dir', help='directory to write human-readable hypotheses')

    args = parser.parse_args()

    json_graph = JsonGraph.from_dict(util.read_json_file(args.graph_path, 'JSON graph'))

    hypotheses_json = util.read_json_file(args.hypothesis_path, 'hypotheses')
    hypothesis_collection = AidaHypothesisCollection.from_json(hypotheses_json, json_graph)

    roles_ontology = util.read_json_file(args.roles_ontology_path, 'roles ontology')

    output_dir = util.get_output_dir(args.output_dir, overwrite_warning=True)

    output_list = []
    for hypo_idx, hypothesis in enumerate(hypothesis_collection.hypotheses):
        output_path = output_dir / 'hypothesis-{:0>3d}.txt'.format(hypo_idx)
        result, _ = hypothesis.to_str_for_csv(roles_ontology)
        with open(str(output_path), "w", encoding="utf-8") as fout:
            print(result, file=fout)
        # Escape commas so they do not break CSV fields, and drop 'ID: ' labels
        result = result.replace(',', ' &').replace('ID: ', '')
        # One block per event/relation; turn the indented role lines of each
        # block into comma-separated fields
        result_list = result.replace('\n    ', ',').split('\n\n')
        for ere_idx, res in enumerate(result_list):
            tmp_res_list = res.split(',')
            if res:
                # Insert an empty time column if the second field does not
                # look like a T1-T4 time expression
                if len(tmp_res_list[1]) < 2 or tmp_res_list[1][:2] not in 'T1T2T3T4':
                    tmp_res_list.insert(1, '')
                # Pad to 9 columns (type, time, arg1-arg5, comments, ID)
                for _ in range(9 - len(tmp_res_list)):
                    tmp_res_list.insert(-1, '')
                # Blank out 'label: ' fields that carry no value
                for idx, tmp_res in enumerate(tmp_res_list):
                    if len(tmp_res.split(': ')) == 2 and tmp_res.split(': ')[1] == '':
                        tmp_res_list[idx] = ''

                # Emit one row per question ID, prefixed with a sortable
                # <question>.<hypothesis>.<ere> identifier
                for question_ID in hypothesis.questionIDs:
                    question_ID = '_'.join(question_ID.split('_')[3:])
                    sin_info = question_ID + '.{}.{}'.format(hypo_idx + 1, ere_idx + 1)
                    sin_info_list = sin_info.replace('.', '_').split('_')
                    sin_info_list = tuple([int(''.join([i for i in x if i.isdigit()])) for x in sin_info_list])
                    tmp_res_list2 = copy.deepcopy(tmp_res_list)
                    tmp_res_list2.insert(0, sin_info)
                    res = ','.join(tmp_res_list2)
                    output_list.append((sin_info_list, res))

    # Sort rows by the numeric components of the SIN identifier
    output_list.sort(key=lambda x: (x[0][0], x[0][2], x[0][1], x[0][3], x[0][4]))
    csv_output_path = output_dir / args.hypothesis_path.split('/')[-1].replace('json', 'csv')
    with open(csv_output_path, 'w', encoding="utf-8") as csv_file:
        csv_file.write('SIN,Event or Relation type,time,arg1,arg2,arg3,arg4,arg5,comments,ID\n')   
        prev = tuple()
        for idx, output in enumerate(output_list):
            if idx != 0 and prev[0] != output[0][0]:
                csv_file.write('\n')
            if idx != 0 and prev[1] != output[0][1]:
                csv_file.write('\n')
            if idx != 0 and prev[2] != output[0][2]:
                csv_file.write('\n')
            if idx != 0 and prev[3] != output[0][3]:
                csv_file.write('\n')
            csv_file.write(output[1] + '\n')
            prev = output[0]
Example #15
def main():
    parser = ArgumentParser()
    parser.add_argument('graph_path', help='path to the graph json file')
    parser.add_argument('hypotheses_path',
                        help='path to the hypotheses json file')
    parser.add_argument('output_dir', help='Directory to write queries')
    parser.add_argument('--top',
                        default=50,
                        type=int,
                        help='number of top hypotheses to output')
    parser.add_argument(
        '-f',
        '--force',
        action='store_true',
        default=False,
        help='If specified, overwrite existing output files without warning')

    args = parser.parse_args()

    json_graph = JsonGraph.from_dict(
        util.read_json_file(args.graph_path, 'JSON graph'))
    mappings = json_graph.build_cluster_member_mappings()

    hypotheses_json = util.read_json_file(args.hypotheses_path, 'hypotheses')

    output_dir = util.get_output_dir(args.output_dir,
                                     overwrite_warning=not args.force)

    top_count = 0

    for result_idx, prob in sorted(enumerate(hypotheses_json['probs']),
                                   key=itemgetter(1),
                                   reverse=True):
        hypothesis = hypotheses_json['support'][result_idx]

        top_count += 1

        update_str = update_prefix + 'INSERT DATA\n{\n'

        prototype_handles = compute_handle_mapping(
            json_graph,
            hypothesis,
            member_to_clusters=mappings['member_to_clusters'],
            cluster_to_prototype=mappings['cluster_to_prototype'])

        for prototype, handle in prototype_handles.items():
            # Strip the surrounding quotes from the handle literal
            handle = handle.strip('"')
            update_str += '  <{}> aida:handle "{}" .\n'.format(
                prototype, handle)

        update_str += '}'

        output_path = output_dir / 'hypothesis-{:0>3d}-update.rq'.format(
            top_count)

        with open(str(output_path), 'w') as fout:
            fout.write(update_str)

        if top_count >= args.top:
            break
Example #16
def shortest_name(ere_label: str, json_graph: JsonGraph):
    names = json_graph.english_names(json_graph.ere_names(ere_label))
    if len(names) > 0:
        return sorted(names, key=lambda n: len(n))[0]
    return None
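
A small usage sketch; the ERE label below is invented for illustration and
would normally come from json_graph.eres.

name = shortest_name('ere-0001', json_graph)
print(name if name is not None else '<no English name>')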
Example #17
def main():
    parser = ArgumentParser()
    parser.add_argument('graph_path', help='Path to the input graph JSON file')
    parser.add_argument(
        'query_path',
        help=
        'Path to the input query file, or a directory with multiple queries')
    parser.add_argument('output_dir',
                        help='Directory to write the raw hypothesis seeds')
    parser.add_argument(
        '-n',
        '--max_num_seeds_per_facet',
        type=int,
        default=None,
        help='If provided, only save up to <arg> seeds per facet')
    parser.add_argument(
        '-d',
        '--discard_failed_core_constraints',
        action='store_true',
        help='If specified, discard hypotheses with failed core constraints. '
        'Try not to use this one during evaluation at first, so that we '
        'do not discard hypotheses we might still need. If we have too many '
        'hypotheses and the script runs too slowly, then use this.')
    parser.add_argument(
        '-r',
        '--rank_cutoff',
        type=int,
        default=100,
        help=
        'If specified, discard hypotheses early if there are at least <arg> '
        'other hypotheses that have the same fillers for a certain number '
        '(default = 3) of their non-entrypoint query variables. We might '
        'need this in the evaluation if some facets have many variables '
        'that lead to combinatorial explosion.')
    parser.add_argument(
        '--frame_grouping',
        action='store_true',
        help=
        'If specified, group query constraints by frames instead of by facets')
    parser.add_argument(
        '-f',
        '--force',
        action='store_true',
        help='If specified, overwrite existing output files without warning')

    args = parser.parse_args()

    json_graph = JsonGraph.from_dict(
        util.read_json_file(args.graph_path, 'JSON graph'))

    query_file_paths = util.get_file_list(args.query_path,
                                          suffix='.json',
                                          sort=True)

    output_dir = util.get_output_dir(args.output_dir,
                                     overwrite_warning=not args.force)

    for query_file_path in query_file_paths:
        query_json = util.read_json_file(query_file_path, 'query')

        raw_seeds_json = make_cluster_seeds(
            json_graph=json_graph,
            query_json=query_json,
            max_num_seeds_per_facet=args.max_num_seeds_per_facet,
            frame_grouping=args.frame_grouping,
            discard_failed_core_constraints=args.discard_failed_core_constraints,
            rank_cutoff=args.rank_cutoff)

        # write hypotheses out in json format.
        output_path = output_dir / (query_file_path.name.split('_')[0] +
                                    '_seeds.json')
        logging.info(
            'Writing raw hypothesis seeds of each facet to {} ...'.format(
                output_path))
        with open(str(output_path), 'w') as fout:
            json.dump(raw_seeds_json, fout, indent=1)
Example #18
def main():
    parser = ArgumentParser()
    parser.add_argument('graph_path', help='Path to the input graph JSON file')
    parser.add_argument(
        'raw_seeds_path',
        help='Path to the raw hypothesis seeds file, or a directory with '
        'multiple seeds files')
    parser.add_argument(
        'output_dir', help='Directory to write the reranked hypothesis seeds')
    parser.add_argument('--plausibility_model_path',
                        help='Path to a hypothesis plausibility model')
    parser.add_argument('--indexer_path', help="Path to the indexers file")
    parser.add_argument('-n',
                        '--max_num_seeds',
                        type=int,
                        default=None,
                        help='Only output up to n hypothesis seeds')
    parser.add_argument(
        '-f',
        '--force',
        action='store_true',
        default=False,
        help='If specified, overwrite existing output files without warning')

    args = parser.parse_args()

    json_graph = JsonGraph.from_dict(
        util.read_json_file(args.graph_path, 'JSON graph'))

    raw_seeds_file_paths = util.get_file_list(args.raw_seeds_path,
                                              suffix='.json',
                                              sort=True)

    output_dir = util.get_output_dir(args.output_dir,
                                     overwrite_warning=not args.force)

    for raw_seeds_file_path in raw_seeds_file_paths:
        raw_seeds_json = util.read_json_file(raw_seeds_file_path,
                                             'seeds by facet')
        seeds_by_facet = {}
        for facet_label, seeds_json in raw_seeds_json.items():
            if facet_label != 'graph':
                seeds_by_facet[facet_label] = [
                    HypothesisSeed.from_json(seed_json, json_graph)
                    for seed_json in seeds_json
                ]

        if args.plausibility_model_path is not None and args.indexer_path is not None:
            seeds_by_facet = rerank_seeds_by_plausibility(
                seeds_by_facet, args.graph_path, args.plausibility_model_path,
                args.indexer_path)

        seeds = select_seeds_by_novelty(seeds_by_facet, args.max_num_seeds)

        hypotheses_to_export = []

        # Turn ranks into log weights for the seed hypotheses. The numbers are
        # essentially arbitrary: the seed at rank r gets log(1 / (r + 1)).
        for rank, seed in enumerate(seeds):
            seed.hypothesis.update_weight(math.log(1.0 / (rank + 1)))
            hypotheses_to_export.append(seed.finalize())

        hypothesis_collection = AidaHypothesisCollection(hypotheses_to_export)

        seeds_json = hypothesis_collection.to_json()
        seeds_json['graph'] = raw_seeds_json['graph']

        output_path = output_dir / (raw_seeds_file_path.name.split('_')[0] +
                                    '_seeds.json')
        logging.info(
            'Writing re-ranked hypothesis seeds to {} ...'.format(output_path))
        with open(str(output_path), 'w') as fout:
            json.dump(seeds_json, fout, indent=1)
Example #19
def main():
    parser = ArgumentParser(
        description=
        'Read in a TA2 KB and a (list of) XML-based Statement of Information Need '
        'definition, convert the KB to JSON format, then convert each SoIN to a JSON '
        'query by identifying and ranking entry points.')
    parser.add_argument('kb_path', help='Path to the input TA2 KB')
    parser.add_argument('graph_output_path',
                        help='Path to write the JSON graph')
    parser.add_argument(
        '-s',
        '--soin_path',
        help=
        'Path to the input SoIN file, or a directory containing multiple SoIN '
        'files; if not provided, will only transform the graph')
    parser.add_argument(
        '-q',
        '--query_output_dir',
        help=
        'Directory to write the JSON queries, used when soin_path is provided')
    parser.add_argument(
        '-m',
        '--max_matches',
        type=int,
        default=50,
        help='The maximum number of EPs *per entry point description*')
    parser.add_argument(
        '-d',
        '--dup_kb',
        default=duplicate_kb_file,
        help='Path to the json file with duplicate KB ID mappings')
    parser.add_argument(
        '-f',
        '--force',
        action='store_true',
        default=False,
        help='If specified, overwrite existing output files without warning')

    args = parser.parse_args()

    kb_path = util.get_input_path(args.kb_path)
    graph_output_path = util.get_output_path(args.graph_output_path,
                                             overwrite_warning=not args.force)

    aida_graph = AidaGraph()
    aida_graph.build_graph(str(kb_path), fmt='ttl')

    json_graph = JsonGraph()
    json_graph.build_graph(aida_graph)

    logging.info('Writing JSON graph to {} ...'.format(graph_output_path))
    with open(str(graph_output_path), 'w') as fout:
        json.dump(json_graph.as_dict(), fout, indent=1)
    logging.info('Done.')

    if args.soin_path is not None:
        assert args.query_output_dir is not None, 'Must provide query_output_dir'
        soin_path = util.get_input_path(args.soin_path)
        query_output_dir = util.get_output_dir(
            args.query_output_dir, overwrite_warning=not args.force)

        soin_file_paths = util.get_file_list(soin_path,
                                             suffix='.xml',
                                             sort=True)

        dup_kb_id_mapping = None
        if args.dup_kb is not None:
            dup_kb_id_mapping = util.read_json_file(args.dup_kb,
                                                    'duplicate KB ID mapping')

        logging.info('Getting Cluster Mappings ...')
        ere_to_prototypes = get_cluster_mappings(aida_graph)

        for soin_file_path in soin_file_paths:
            query_output_path = query_output_dir / (soin_file_path.stem +
                                                    '_query.json')

            logging.info('Processing SOIN {} ...'.format(soin_file_path))
            soin = SOIN.parse(str(soin_file_path),
                              dup_kbid_mapping=dup_kb_id_mapping)

            logging.info('Resolving all entrypoints ...')
            soin.resolve(aida_graph,
                         ere_to_prototypes,
                         max_matches=args.max_matches)

            query_json = {'graph': kb_path.stem}
            query_json.update(soin.to_json())

            logging.info(
                'Writing JSON query to {} ...'.format(query_output_path))
            with open(str(query_output_path), 'w') as fout:
                json.dump(query_json, fout, indent=1)