def main():
    """Compress a JSON graph by merging coreferent EREs and their statements.

    Reads an input JSON graph, builds coref mappings, writes the compressed
    graph, and writes a JSON log of the (non-key-based) mappings used.
    """
    parser = ArgumentParser()
    parser.add_argument('input_graph_path', help='path to the input graph json file')
    parser.add_argument('output_graph_path', help='path to write the coref-compressed graph')
    parser.add_argument('output_log_path', help='path to write the log file')
    parser.add_argument(
        '-f', '--force', action='store_true', default=False,
        help='If specified, overwrite existing output files without warning')
    args = parser.parse_args()

    # Resolve output paths first so overwrite warnings fire before any
    # heavy processing starts.
    output_graph_path = util.get_output_path(args.output_graph_path,
                                             overwrite_warning=not args.force)
    output_log_path = util.get_output_path(args.output_log_path,
                                           overwrite_warning=not args.force)

    input_json_graph = JsonGraph.from_dict(
        util.read_json_file(args.input_graph_path, 'JSON graph'))

    num_old_eres = len(list(input_json_graph.each_ere()))
    # Sanity check: the ERE iterator and the ERE list must agree in size.
    assert num_old_eres == len(input_json_graph.eres)
    num_old_stmts = len(list(input_json_graph.each_statement()))
    logging.info(
        'Found {} EREs and {} statements in the original graph'.format(
            num_old_eres, num_old_stmts))

    # Cluster/member/prototype mappings plus old-to-new statement mappings.
    mappings = build_mappings(input_json_graph)

    output_json_graph = JsonGraph()

    num_new_eres = compress_eres(input_json_graph, mappings, output_json_graph)
    num_new_stmts = compress_statements(input_json_graph, mappings,
                                        output_json_graph)
    logging.info(
        'Finished coref-compressed graph with {} EREs and {} statements'.
        format(num_new_eres, num_new_stmts))

    logging.info(
        'Writing compressed json graph to {}'.format(output_graph_path))
    with open(str(output_graph_path), 'w') as fout:
        json.dump(output_json_graph.as_dict(), fout, indent=1)

    # Build the log: skip key-based mappings (name contains 'key'); for
    # plural-named mappings convert set values to lists so they are
    # JSON-serializable.
    log_json = {}
    for mapping_key, mapping in mappings.items():
        if 'key' in mapping_key:
            continue
        if mapping_key.endswith('s'):
            log_json[mapping_key] = {k: list(v) for k, v in mapping.items()}
        else:
            log_json[mapping_key] = mapping

    logging.info('Writing compression log to {}'.format(output_log_path))
    with open(str(output_log_path), 'w') as fout:
        json.dump(log_json, fout, indent=2)
def main(): parser = ArgumentParser() # required positional parser.add_argument('graph_path', help='path to the graph JSON file') parser.add_argument('hypothesis_path', help='path to the JSON file with hypotheses') args = parser.parse_args() # read KB json_graph = JsonGraph.from_dict( util.read_json_file(args.graph_path, 'JSON graph')) # read hypotheses hypotheses_json = util.read_json_file(args.hypothesis_path, 'hypotheses') hypothesis_collection = AidaHypothesisCollection.from_json( hypotheses_json, json_graph) analysis_obj = defaultdict(list) for hyp in hypothesis_collection: analysis_obj = hyp_stats(hyp, analysis_obj, json_graph) # for idx in range(len(analysis_obj["stmts"])): # print("-----------Hypothesis", idx, "-------") # for key, val in analysis_obj.items(): # print(key, ":", val[idx]) print("================ Overall =============") for key, val in analysis_obj.items(): print(key, round(sum(val) / len(val), 2))
def build_mappings(json_graph: JsonGraph):
    """Build coref mappings for graph compression.

    Starts from the graph's cluster/member/prototype mappings and adds four
    statement mappings: old statement labels to new (deduplicated) statement
    labels and back, plus statement-key lookups in both directions.

    Returns the combined mappings dict.
    """
    # Build mappings among clusters, members, and prototypes
    mappings = json_graph.build_cluster_member_mappings()

    # Build mappings from old statement labels to new statement labels
    stmt_count = 0
    stmt_key_to_new_stmt = {}
    new_stmt_to_stmt_key = {}
    old_stmt_to_new_stmts = defaultdict(set)
    new_stmt_to_old_stmts = defaultdict(set)

    for node_label, node in json_graph.node_dict.items():
        if node.type == 'Statement':
            # A statement may produce several keys (one per prototype
            # combination of its arguments).
            stmt_keys = make_stmt_keys(
                stmt_entry=node,
                member_to_prototypes=mappings['member_to_prototypes'])
            for stmt_key in stmt_keys:
                if stmt_key not in stmt_key_to_new_stmt:
                    # First time we see this key: mint a new statement label.
                    new_stmt_label = 'Statement-{}'.format(stmt_count)
                    stmt_count += 1
                    stmt_key_to_new_stmt[stmt_key] = new_stmt_label
                    new_stmt_to_stmt_key[new_stmt_label] = stmt_key
                else:
                    new_stmt_label = stmt_key_to_new_stmt[stmt_key]
                # Record the many-to-many old/new correspondence.
                old_stmt_to_new_stmts[node_label].add(new_stmt_label)
                new_stmt_to_old_stmts[new_stmt_label].add(node_label)

    num_old_stmts = len(old_stmt_to_new_stmts)
    num_new_stmts = len(new_stmt_to_old_stmts)
    # Both directions of the key mapping must cover every new statement.
    assert len(stmt_key_to_new_stmt) == num_new_stmts
    assert len(new_stmt_to_stmt_key) == num_new_stmts

    print('\nConstructed mapping from {} old statements to {} new statements'.
          format(num_old_stmts, num_new_stmts))

    # Report how many old statements fan out to multiple new statements.
    new_stmts_per_old_stmt_counter = Counter(
        [len(v) for v in old_stmt_to_new_stmts.values()])
    for key in sorted(new_stmts_per_old_stmt_counter.keys()):
        if key > 1:
            print(
                '\tFor {} out of {} old statements, each is mapped to {} new statements'
                .format(new_stmts_per_old_stmt_counter[key], num_old_stmts, key))

    mappings.update({
        'stmt_key_to_new_stmt': stmt_key_to_new_stmt,
        'new_stmt_to_stmt_key': new_stmt_to_stmt_key,
        'old_stmt_to_new_stmts': old_stmt_to_new_stmts,
        'new_stmt_to_old_stmts': new_stmt_to_old_stmts
    })

    return mappings
def compress_statements(input_json_graph: JsonGraph, mappings: Dict,
                        output_json_graph: JsonGraph):
    """Create one StatementNode per deduplicated statement in the output graph.

    Each new statement takes its (subject, predicate, object) from its
    statement key and its confidence from the maximum over the old
    statements merged into it. Returns the number of statements added.
    """
    logging.info('Building statement entries for the compressed graph ...')

    # The output graph must start with no statements.
    assert len(output_json_graph.statements) == 0

    num_new_stmts = 0
    for new_stmt, stmt_key in mappings['new_stmt_to_stmt_key'].items():
        # New statement labels have the form 'Statement-<idx>'.
        stmt_idx = int(new_stmt.split('-')[1])
        subj, pred, obj = stmt_key
        new_entry = {
            'type': 'Statement',
            'index': stmt_idx,
            'subject': subj,
            'predicate': pred,
            'object': obj
        }

        # Confidence = max over all merged old statements that carry one;
        # None if none of them does.
        conf_levels = set()
        for old_stmt in mappings['new_stmt_to_old_stmts'][new_stmt]:
            old_stmt_entry = input_json_graph.node_dict[old_stmt]
            if old_stmt_entry.conf is not None:
                conf_levels.add(old_stmt_entry.conf)
        new_entry['conf'] = max(conf_levels) if conf_levels else None

        # old_stmt_entry_list = [input_json_graph.node_dict[old_stmt]
        #                        for old_stmt in mappings['new_stmt_to_old_stmts'][new_stmt]]
        #
        # # Resolve the extra information (source and hypotheses) of the new
        # # statement
        # for label in ['source', 'hypotheses_supported', 'hypotheses_partially_supported',
        #               'hypotheses_contradicted']:
        #     label_value_set = set()
        #     for old_stmt_entry in old_stmt_entry_list:
        #         if label in old_stmt_entry:
        #             label_value_set.update(old_stmt_entry[label])
        #     if len(label_value_set) > 0:
        #         new_entry[label] = list(label_value_set)

        output_json_graph.node_dict[new_stmt] = StatementNode(**new_entry)
        output_json_graph.statements.append(new_stmt)

        num_new_stmts += 1

    return num_new_stmts
def main():
    """Filter hypothesis files against a graph and write the survivors.

    For each input hypotheses file: expand, drop hypotheses that are too
    short, run the AidaHypothesisFilter, compactify, and write the result
    (preserving any 'graph'/'queries' entries) to the output directory.
    """
    parser = ArgumentParser()
    parser.add_argument('graph_path', help='Path to the input graph JSON file')
    parser.add_argument('hypotheses_path',
                        help='Path to the raw hypotheses file, or a directory with multiple files')
    parser.add_argument('output_dir',
                        help='Directory to write the filtered hypothesis files(s)')
    parser.add_argument('-f', '--force', action='store_true', default=False,
                        help='If specified, overwrite existing output files without warning')
    args = parser.parse_args()

    output_dir = util.get_output_dir(args.output_dir,
                                     overwrite_warning=not args.force)

    json_graph = JsonGraph.from_dict(
        util.read_json_file(args.graph_path, 'JSON graph'))

    # A single file or every '.json' in a directory, in sorted order.
    hypotheses_file_paths = util.get_file_list(args.hypotheses_path,
                                               suffix='.json', sort=True)

    for hypotheses_file_path in hypotheses_file_paths:
        hypotheses_json = util.read_json_file(hypotheses_file_path, 'hypotheses')
        hypothesis_collection = AidaHypothesisCollection.from_json(
            hypotheses_json, json_graph)
        hypothesis_collection.expand()

        # create the filter
        hypothesis_filter = AidaHypothesisFilter(json_graph)

        # Drop too-short hypotheses before filtering the rest.
        filtered_hyplist = [hypothesis_filter.filtered(hypothesis)
                            for hypothesis in hypothesis_collection
                            if not hypothesis_too_short(hypothesis, json_graph)]
        filtered_hypothesis_collection = AidaHypothesisCollection(
            compactify(filtered_hyplist, json_graph))

        filtered_hypotheses_json = filtered_hypothesis_collection.to_json()

        # add graph filename and queries, if they were there before
        if 'graph' in hypotheses_json:
            filtered_hypotheses_json['graph'] = hypotheses_json['graph']
        if "queries" in hypotheses_json:
            filtered_hypotheses_json['queries'] = hypotheses_json['queries']

        # Output file keeps the input file's name.
        output_path = output_dir / hypotheses_file_path.name
        logging.info('Writing filtered hypotheses to {} ...'.format(output_path))
        with open(str(output_path), 'w') as fout:
            json.dump(filtered_hypotheses_json, fout, indent=1)
def main():
    """Report statistics for a TA2 KB (.ttl) or a pre-converted JSON graph (.json)."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('input_path', help='Path to a TA2 KB or a JSON graph')
    parsed_args = arg_parser.parse_args()

    resolved_path = util.get_input_path(parsed_args.input_path)
    suffix = resolved_path.suffix

    if suffix == '.ttl':
        # Turtle input: parse the KB as an RDF graph and summarize it.
        rdf_graph = AidaGraph()
        rdf_graph.build_graph(str(resolved_path), fmt='ttl')
        get_kb_stats(rdf_graph)
    elif suffix == '.json':
        # JSON input: load the already-converted graph representation.
        with open(resolved_path, 'r') as fin:
            loaded_graph = JsonGraph.from_dict(json.load(fin))
        get_json_stats(loaded_graph)
def main():
    """Write each hypothesis in the collection to a numbered human-readable text file."""
    arg_parser = ArgumentParser()
    arg_parser.add_argument('graph_path', help='path to the graph JSON file')
    arg_parser.add_argument('hypothesis_path', help='path to the JSON file with hypotheses')
    arg_parser.add_argument('roles_ontology_path', help='path to the roles ontology file')
    arg_parser.add_argument('output_dir', help='directory to write human-readable hypotheses')
    args = arg_parser.parse_args()

    # Load the graph, the hypotheses, and the roles ontology.
    graph = JsonGraph.from_dict(
        util.read_json_file(args.graph_path, 'JSON graph'))
    hypotheses_json = util.read_json_file(args.hypothesis_path, 'hypotheses')
    collection = AidaHypothesisCollection.from_json(hypotheses_json, graph)
    ontology = util.read_json_file(args.roles_ontology_path, 'roles ontology')

    target_dir = util.get_output_dir(args.output_dir, overwrite_warning=True)

    # One text file per hypothesis, numbered by its position in the collection.
    for idx, hypothesis in enumerate(collection.hypotheses):
        target_path = target_dir / 'hypothesis-{:0>3d}.txt'.format(idx)
        with open(str(target_path), "w", encoding="utf-8") as fout:
            print(hypothesis.to_str(ontology), file=fout)
def compress_eres(input_json_graph: JsonGraph, mappings: Dict,
                  output_json_graph: JsonGraph):
    """Create one merged ERE node per prototype in the output graph.

    For each prototype, merges its cluster members' type, adjacent
    statements (remapped to new statement labels), names (plus usable
    cluster handles), and LDC time entries, then copies over the
    corresponding SameAsCluster and ClusterMembership nodes.

    Returns the number of ERE nodes added.
    """
    # The output graph must start with no EREs.
    assert len(output_json_graph.eres) == 0

    logging.info(
        'Building ERE / SameAsCluster / ClusterMembership entries for the compressed '
        'graph ...')

    num_new_eres = 0

    for prototype, members in mappings['prototype_to_members'].items():
        old_entry = input_json_graph.node_dict[prototype]

        # Use the same ERE index from the original graph
        new_entry = {'index': old_entry.index}

        member_entry_list = [
            input_json_graph.node_dict[member] for member in members
        ]

        # Resolve the type of the compressed ERE node
        type_set = set(member_entry.type for member_entry in member_entry_list)
        # if len(type_set) > 1:
        #     type_set.remove('Entity')
        if len(type_set) > 1:
            # Members disagree on type; an arbitrary one is used below
            # (set.pop() is unordered).
            logging.warning(
                'Error: multiple types {} from the following EREs {}'.format(
                    type_set, members))
        new_entry['type'] = type_set.pop()

        # Resolve the adjacent statements of the compressed ERE node
        adjacency_set = set()
        for member_entry in member_entry_list:
            for old_stmt in member_entry.adjacent:
                # Remap each old adjacent statement to its new label(s).
                adjacency_set.update(
                    mappings['old_stmt_to_new_stmts'][old_stmt])
        new_entry['adjacent'] = list(adjacency_set)

        # Resolve the names of the compressed ERE node
        name_set = set()
        for member_entry in member_entry_list:
            name_set.update(member_entry.name)
        for cluster in mappings['prototype_to_clusters'][prototype]:
            cluster_handle = input_json_graph.node_dict[cluster].handle
            # Only keep meaningful handles as extra names.
            if cluster_handle is not None and cluster_handle != '[unknown]':
                name_set.add(cluster_handle)
        new_entry['name'] = list(name_set)

        # Resolve the LDC time list of the compressed ERE node
        ldc_time_list = []
        for member_entry in member_entry_list:
            ldc_time_list.extend(member_entry.ldcTime)
        new_entry['ldcTime'] = ldc_time_list

        output_json_graph.node_dict[prototype] = ERENode(**new_entry)
        output_json_graph.eres.append(prototype)

        # Add SameAsCluster nodes and ClusterMembership nodes
        for cluster in mappings['prototype_to_clusters'][prototype]:
            output_json_graph.node_dict[cluster] = deepcopy(
                input_json_graph.node_dict[cluster])
            for cluster_membership_key in \
                    mappings['cluster_membership_key_mapping'][(cluster, prototype)]:
                output_json_graph.node_dict[cluster_membership_key] = deepcopy(
                    input_json_graph.node_dict[cluster_membership_key])

        num_new_eres += 1

    return num_new_eres
def main():
    """Write (and optionally run) SPARQL queries for the top-scoring hypotheses.

    Ranks hypotheses by probability, writes one SPARQL query file per
    hypothesis, and unless --dry_run is given, executes each query with the
    external 'tdbquery' tool against the first tdb database copy.
    """
    parser = ArgumentParser()
    parser.add_argument('graph_path', help='path to the graph json file')
    parser.add_argument('hypotheses_path', help='path to the hypotheses json file')
    parser.add_argument('db_dir', help='directory with copies of tdb databases')
    parser.add_argument('output_dir', help='path to output directory')
    parser.add_argument('--top', default=50, type=int,
                        help='number of top hypothesis to output')
    parser.add_argument('--dry_run', action='store_true',
                        help='if specified, only write the SPARQL queries to '
                             'files, without actually executing the queries')
    parser.add_argument('--query_just', action='store_true')
    parser.add_argument('--query_conf', action='store_true')
    parser.add_argument('-f', '--force', action='store_true', default=False,
                        help='If specified, overwrite existing output files without warning')
    args = parser.parse_args()

    json_graph = JsonGraph.from_dict(
        util.read_json_file(args.graph_path, 'JSON graph'))

    mappings = json_graph.build_cluster_member_mappings()
    member_to_clusters = mappings['member_to_clusters']
    cluster_to_prototype = mappings['cluster_to_prototype']
    prototype_set = set(mappings['prototype_to_clusters'].keys())

    hypotheses_json = util.read_json_file(args.hypotheses_path, 'hypotheses')

    output_dir = util.get_output_dir(args.output_dir,
                                     overwrite_warning=not args.force)

    # Each 'copy*' subdirectory is a separate tdb database copy.
    db_dir = util.get_input_path(args.db_dir)
    db_path_list = [str(path) for path in sorted(db_dir.glob('copy*'))]
    print('Using the following tdb databases to query: {}'.format(db_path_list))

    num_node_queries = len(db_path_list)

    top_count = 0
    # Iterate hypotheses from highest to lowest probability.
    for result_idx, prob in sorted(
            enumerate(hypotheses_json['probs']), key=itemgetter(1), reverse=True):
        hypothesis = hypotheses_json['support'][result_idx]

        # node_query_list, stmt_query_list, just_query_list, conf_query_list = \
        sparql_query_str = \
            queries_for_aida_result(
                json_graph=json_graph,
                hypothesis=hypothesis,
                member_to_clusters=member_to_clusters,
                cluster_to_prototype=cluster_to_prototype,
                prototype_set=prototype_set,
                num_node_queries=num_node_queries,
                query_just=args.query_just,
                query_conf=args.query_conf)

        top_count += 1

        print(f'Writing queries for hypothesis #{top_count} with prob {prob}')
        sparql_query_path = output_dir / 'hypothesis-{:0>3d}-query.rq'.format(top_count)
        with open(str(sparql_query_path), 'w') as fout:
            fout.write(sparql_query_str + '\n')

        if not args.dry_run:
            query_result_path = output_dir / 'hypothesis-{:0>3d}-raw.ttl'.format(top_count)
            # NOTE(review): shell=True with interpolated paths; paths come
            # from local CLI arguments, but consider a list-based
            # subprocess.run invocation to avoid shell-quoting issues.
            query_cmd = 'echo "query {0}"; tdbquery --loc {1} --query {0} > {2}; '.format(
                sparql_query_path, db_path_list[0], query_result_path)

            print('Executing queries ...')
            process = subprocess.Popen(query_cmd, shell=True)
            process.wait()

            # sparql_helper.execute_sparql_queries(
            #     node_query_list, stmt_query_list, just_query_list, conf_query_list,
            #     db_path_list, output_dir,
            #     filename_prefix='hypothesis-{:0>3d}'.format(top_count),
            #     header_prefixes=AIF_HEADER_PREFIXES, dry_run=args.dry_run)

        if top_count >= args.top:
            break
def main():
    """Build and write AIF subgraphs (TTL) for the top hypotheses of each file.

    For every hypotheses JSON file, ranks hypotheses by probability, converts
    each log-probability into a weight, builds the corresponding subgraph
    from the TA2 KB, and writes it as '<run>.<soin>.<frame>.H<nnn>.ttl'.
    """
    parser = ArgumentParser()
    parser.add_argument('graph_path', help='path to the graph json file')
    parser.add_argument('hypotheses_path', help='path to the hypotheses json directory')
    parser.add_argument('kb_path', help='path to the TA2 KB file (in AIF)')
    parser.add_argument('output_dir', help='path to output directory')
    parser.add_argument('run_id', help='TA3 run ID')
    parser.add_argument('sin_id_prefix',
                        help='prefix of SIN IDs to name the final hypotheses')
    parser.add_argument('--top', default=50, type=int,
                        help='number of top hypothesis to output')
    parser.add_argument(
        '-f', '--force', action='store_true', default=False,
        help='If specified, overwrite existing output files without warning')
    args = parser.parse_args()

    json_graph = JsonGraph.from_dict(
        util.read_json_file(args.graph_path, 'JSON graph'))

    graph_mappings = json_graph.build_cluster_member_mappings()

    hypotheses_file_paths = util.get_file_list(args.hypotheses_path,
                                               suffix='.json', sort=True)

    # TODO: there is a known bug in rdflib that
    #  rdflib.Literal("2008", datatype=rdflib.XSD.gYear) would be parsed into
    #  rdflib.term.Literal(u'2008-01-01', datatype=rdflib.XSD.gYear) automatically,
    #  because a `parse_date` function is invoked for all rdflib.XSD.gYear literals.
    #  This is a temporary workaround to patch the _toPythonMapping locally.
    #  c.f.: https://github.com/RDFLib/rdflib/issues/806
    # noinspection PyProtectedMember
    rdflib.term._toPythonMapping.pop(rdflib.XSD['gYear'])

    print('Reading kb from {}'.format(args.kb_path))
    kb_graph = Graph()
    kb_graph.parse(args.kb_path, format='ttl')

    # Pre-index the KB so subgraph construction can look nodes up quickly.
    kb_nodes_by_category = catalogue_kb_nodes(kb_graph)

    kb_stmt_key_mapping = index_statement_nodes(
        kb_graph, kb_nodes_by_category['Statement'])
    kb_cm_key_mapping = index_cluster_membership_nodes(
        kb_graph, kb_nodes_by_category['ClusterMembership'])
    kb_type_stmt_key_mapping = index_type_statement_nodes(
        kb_graph, kb_nodes_by_category['TypeStatement'])

    output_dir = util.get_output_dir(args.output_dir,
                                     overwrite_warning=not args.force)

    run_id = args.run_id
    sin_id_prefix = args.sin_id_prefix

    for hypotheses_file_path in hypotheses_file_paths:
        hypotheses_json = util.read_json_file(hypotheses_file_path, 'hypotheses')

        print('Found {} hypotheses with probability {}'.format(
            len(hypotheses_json['probs']), hypotheses_json['probs']))

        # SOIN ID comes from the prefix plus the file stem's first token.
        soin_id = sin_id_prefix + '_' + hypotheses_file_path.stem.split('_')[0]
        frame_id = soin_id + '_F1'

        top_count = 0
        # Iterate hypotheses from highest to lowest probability.
        for hypothesis_idx, prob in sorted(enumerate(hypotheses_json['probs']),
                                           key=itemgetter(1), reverse=True):
            # Probabilities are expected to be log-probs (<= 0); positive
            # values are treated as anomalous and given a tiny weight.
            if prob <= 0.0:
                hypothesis_weight = math.exp(prob / 2.0)
            else:
                hypothesis_weight = 0.0001

            hypothesis = hypotheses_json['support'][hypothesis_idx]

            top_count += 1
            hypothesis_id = '{}_hypothesis_{:0>3d}'.format(frame_id, top_count)

            subgraph = build_subgraph_for_hypothesis(
                kb_graph=kb_graph,
                kb_nodes_by_category=kb_nodes_by_category,
                kb_stmt_key_mapping=kb_stmt_key_mapping,
                kb_cm_key_mapping=kb_cm_key_mapping,
                kb_type_stmt_key_mapping=kb_type_stmt_key_mapping,
                json_graph=json_graph,
                graph_mappings=graph_mappings,
                hypothesis=hypothesis,
                hypothesis_id=hypothesis_id,
                hypothesis_weight=hypothesis_weight)

            output_path = output_dir / '{}.{}.{}.H{:0>3d}.ttl'.format(
                run_id, soin_id, frame_id, top_count)
            print('Writing hypothesis #{:>2d} with prob {:>6.2f} to {}'.format(
                top_count, prob, output_path))
            with open(output_path, 'w') as fout:
                fout.write(print_graph(subgraph))

            if top_count >= args.top:
                break
def get_json_stats(json_graph: JsonGraph):
    """Print node/ERE/statement/cluster statistics for a JSON graph.

    An ERE is counted as a singleton when all of its adjacent statements
    are type statements.
    """
    eres = []
    singleton_eres = []
    entities = []
    singleton_entities = []
    relations = []
    singleton_relations = []
    events = []
    singleton_events = []
    stmts = []
    type_stmts = []
    clusters = []
    cluster_memberships = []
    prototypes = []
    # ERE label -> set of ClusterMembership node labels it appears in.
    ere_to_memberships = defaultdict(set)
    # ERE label -> set of cluster labels it belongs to.
    ere_to_clusters = defaultdict(set)

    for node_label, node in json_graph.node_dict.items():
        if json_graph.is_ere(node_label):
            eres.append(node_label)

            # Singleton: every adjacent statement is a type statement.
            is_singleton = True
            for stmt_label in json_graph.each_ere_adjacent_stmt(node_label):
                if not json_graph.is_type_stmt(stmt_label):
                    is_singleton = False
                    break
            if is_singleton:
                singleton_eres.append(node_label)

            if json_graph.is_entity(node_label):
                entities.append(node_label)
                if is_singleton:
                    singleton_entities.append(node_label)
            if json_graph.is_relation(node_label):
                relations.append(node_label)
                if is_singleton:
                    singleton_relations.append(node_label)
            if json_graph.is_event(node_label):
                events.append(node_label)
                if is_singleton:
                    singleton_events.append(node_label)

        if json_graph.is_statement(node_label):
            stmts.append(node_label)
            if json_graph.is_type_stmt(node_label):
                type_stmts.append(node_label)

        if node.type == 'SameAsCluster':
            clusters.append(node_label)
            prototypes.append(node.prototype)
            ere_to_clusters[node.prototype].add(node_label)

        if node.type == 'ClusterMembership':
            cluster_memberships.append(node_label)
            # NOTE(review): this appends the cluster once per membership, on
            # top of the append in the SameAsCluster branch, so
            # len(clusters) may double-count clusters — confirm whether the
            # '# SameAsClusters' figure is intended to count memberships too.
            clusters.append(node.cluster)
            ere_to_clusters[node.clusterMember].add(node.cluster)
            ere_to_memberships[node.clusterMember].add(node_label)

    print(f'# Nodes: {len(json_graph.node_dict)}')
    print(f'# EREs: {len(eres)} ({len(singleton_eres)} are singleton)')
    print(f'# Entities: {len(entities)} ({len(singleton_entities)} are singleton)')
    print(f'# Relations: {len(relations)} ({len(singleton_relations)} are singleton)')
    print(f'# Events: {len(events)} ({len(singleton_events)} are singleton)')
    print(f'# Statements: {len(stmts)}')
    print(f'# Type Statements: {len(type_stmts)}')
    print(f'# SameAsClusters: {len(clusters)}')
    print(f'# ClusterMemberships: {len(cluster_memberships)}')
    print(f'# Prototype EREs: {len(prototypes)}')

    num_clusters_per_ere = [len(val) for val in ere_to_clusters.values()]
    print(f'# Clusters per ERE: min = {min(num_clusters_per_ere)}, '
          f'max = {max(num_clusters_per_ere)}, '
          f'mean = {sum(num_clusters_per_ere) / len(num_clusters_per_ere)}')

    num_memberships_per_ere = [len(val) for val in ere_to_memberships.values()]
    print(f'# Memberships per ERE: min = {min(num_memberships_per_ere)}, '
          f'max = {max(num_memberships_per_ere)}, '
          f'mean = {sum(num_memberships_per_ere) / len(num_memberships_per_ere)}')
def main():
    """Write SPARQL update queries adding importance values for top hypotheses.

    For each top hypothesis, emits numbered '.rq' files: (0) an INSERT DATA
    query declaring the hypothesis, its weight, and node importance values;
    (1) an INSERT/WHERE query adding all EREs to the hypothesis subgraph;
    (2+) one INSERT/WHERE query per statement's importance value.
    """
    parser = ArgumentParser()
    parser.add_argument('graph_path', help='path to the graph json file')
    parser.add_argument('hypotheses_path', help='path to the hypotheses json file')
    parser.add_argument('output_dir', help='Directory to write queries')
    parser.add_argument('frame_id', help='Frame ID of the hypotheses')
    parser.add_argument('--top', default=50, type=int,
                        help='number of top hypothesis to output')
    parser.add_argument(
        '-f', '--force', action='store_true', default=False,
        help='If specified, overwrite existing output files without warning')
    args = parser.parse_args()

    json_graph = JsonGraph.from_dict(
        util.read_json_file(args.graph_path, 'JSON graph'))

    mappings = json_graph.build_cluster_member_mappings()

    hypotheses_json = util.read_json_file(args.hypotheses_path, 'hypotheses')

    output_dir = util.get_output_dir(args.output_dir,
                                     overwrite_warning=not args.force)

    frame_id = args.frame_id

    top_count = 0
    # Iterate hypotheses from highest to lowest probability.
    for result_idx, prob in sorted(enumerate(hypotheses_json['probs']),
                                   key=itemgetter(1), reverse=True):
        # Probabilities are expected to be log-probs (<= 0); positive values
        # are treated as anomalous and given a tiny weight.
        if prob <= 0.0:
            hyp_weight = math.exp(prob / 2.0)
        else:
            hyp_weight = 0.0001

        hypothesis = hypotheses_json['support'][result_idx]

        top_count += 1

        hypothesis_id = '{}_hypothesis_{:0>3d}'.format(frame_id, top_count)
        hypothesis_name = 'utexas:{}'.format(hypothesis_id)
        subgraph_name = hypothesis_name + '_subgraph'

        update_query_count = 0

        # Build an update query to add aida:Hypothesis and its importance values, as well as
        # the importance values for all event and relation clusters.
        update_str = update_prefix + 'INSERT DATA\n{\n'
        update_str += ' {} a aida:Hypothesis .\n'.format(hypothesis_name)
        update_str += ' {} aida:importance "{:.4f}"^^xsd:double .\n'.format(
            hypothesis_name, hyp_weight)
        update_str += ' {} aida:hypothesisContent {} .\n'.format(
            hypothesis_name, subgraph_name)
        update_str += ' {} a aida:Subgraph .\n'.format(subgraph_name)

        stmt_importance, node_importance = compute_importance_mapping(
            json_graph, hypothesis,
            member_to_clusters=mappings['member_to_clusters'],
            cluster_to_prototype=mappings['cluster_to_prototype'])

        for node_id, importance_value in node_importance.items():
            update_str += ' <{}> aida:importance "{:.4f}"^^xsd:double .\n'.format(
                node_id, importance_value)

        update_str += '}'

        output_path = output_dir / 'hypothesis-{:0>3d}-update-{:0>4d}.rq'.format(
            top_count, update_query_count)
        with open(str(output_path), 'w') as fout:
            fout.write(update_str)

        update_query_count += 1

        # Build an update query for the aida:subgraphContains field of the aida:Subgraph node as
        # the aida:hypothesisContent. We just include all ERE nodes for simplicity, as it's not
        # required that all KEs should be included for NIST to evaluate in M18.
        update_str = update_prefix
        update_str += \
            'INSERT {{\n' \
            '{} aida:subgraphContains ?e .\n' \
            '}}\nWHERE\n{{\n' \
            '{{ ?e a aida:Entity }}\nUNION\n' \
            '{{ ?e a aida:Relation }}\nUNION\n' \
            '{{ ?e a aida:Event }}\n}}\n'.format(subgraph_name)

        output_path = output_dir / 'hypothesis-{:0>3d}-update-{:0>4d}.rq'.format(
            top_count, update_query_count)
        with open(str(output_path), 'w') as fout:
            fout.write(update_str)

        update_query_count += 1

        # Build an update query for the importance value of each statement. We would need
        # a separate query for each statement, because we need to use the INSERT {} WHERE {}
        # operator here to allow BNode statements.
        for (stmt_subj, stmt_pred, stmt_obj), importance_value in stmt_importance.items():
            update_str = update_prefix
            update_str += \
                'INSERT {{ ?x aida:importance "{:.4f}"^^xsd:double . }}\n' \
                'WHERE\n{{\n' \
                '?x a rdf:Statement .\n' \
                '?x rdf:subject <{}> .\n' \
                '?x rdf:predicate ldcOnt:{} .\n' \
                '?x rdf:object <{}> .\n}}\n'.format(
                    importance_value, stmt_subj, stmt_pred, stmt_obj)

            output_path = output_dir / 'hypothesis-{:0>3d}-update-{:0>4d}.rq'.format(
                top_count, update_query_count)
            with open(str(output_path), 'w') as fout:
                fout.write(update_str)

            update_query_count += 1

        if top_count >= args.top:
            break
def main():
    """Interactive console for exploring hypotheses over a JSON graph.

    Presents a menu loop: display core hypotheses, inspect EREs / role
    fillers (optionally restricted to a question ID and a core role filler),
    survey graph context independent of hypotheses, print hypotheses, or
    permanently restrict the collection. Exits on choice 'x'.
    """
    parser = ArgumentParser()

    # required positional
    parser.add_argument('graph_path', help='path to the graph JSON file')
    parser.add_argument('hypothesis_path', help='path to the JSON file with hypotheses')
    parser.add_argument("roles_ontology_path", help="path to roles ontology")

    args = parser.parse_args()

    print("Reading in data...")

    # read KB
    json_graph = JsonGraph.from_dict(
        util.read_json_file(args.graph_path, 'JSON graph'))

    # read hypotheses
    hypotheses_json = util.read_json_file(args.hypothesis_path, 'hypotheses')
    hypothesis_collection = AidaHypothesisCollection.from_json(
        hypotheses_json, json_graph)

    # read roles ontology
    roles_ontology = util.read_json_file(args.roles_ontology_path,
                                         'roles ontology')

    # determine all question IDs
    questionIDs = set()
    for h in hypothesis_collection:
        questionIDs.update(h.questionIDs)

    choice = question_id = restrict_core_role = restrict_core_ere = None
    # Bug fix: previously this_hypothesis_collection was only bound inside
    # the c/e/r/p branch, so picking "sr" first raised a NameError. Default
    # to the full collection until the user narrows it down.
    this_hypothesis_collection = hypothesis_collection

    while choice != "x":
        # determine core choice
        print("question IDs:", ", ".join(questionIDs))
        print("Choose from:")
        print("c: core hypothesis display")
        print("e: show events/relations connected to an ere")
        print("r: show events/relations connected to a role filler")
        print("se: survey context of an ERE independent of hypotheses")
        print("sr: survey context of a role filler independent of hypotheses")
        print("p: print hypotheses for a particular question ID")
        print(
            "R: restrict hypotheses to be considered going forward, for the rest of the run"
        )
        print("x: exit")

        choice = input()

        # determine additional restrictions on hypotheses to consider
        if choice in ["c", "e", "r", "p"]:
            question_id = input("Question ID: ")
            # filter hypotheses by question ID
            this_hypothesis_collection = filter_hypotheses_by_question(
                hypothesis_collection, question_id)

            # additionally filter by a core role filler?
            restrict_core_role = input("Optional core role to restrict: ")
            if restrict_core_role != "":
                restrict_core_ere = input(
                    "Value to restrict the core role to (ERE ID): ")
                this_hypothesis_collection = filter_hypotheses_by_entrypoints(
                    this_hypothesis_collection, json_graph,
                    restrict_core_role, restrict_core_ere)

        # execute choice
        if choice == "c":
            show_core(json_graph, this_hypothesis_collection)
        elif choice == "e":
            show_ere(json_graph, this_hypothesis_collection, roles_ontology)
        elif choice == "r":
            show_rolefiller(json_graph, this_hypothesis_collection,
                            roles_ontology)
        elif choice == "se":
            show_ere_graphenv(json_graph, roles_ontology)
        elif choice == "sr":
            show_role_graphenv(json_graph, this_hypothesis_collection,
                               roles_ontology)
        elif choice == "R":
            restrict_core_role = input("Core role to restrict: ")
            restrict_core_ere = input(
                "Value to restrict the core role to (ERE ID): ")
            hypothesis_collection = filter_hypotheses_by_entrypoints(
                hypothesis_collection, json_graph, restrict_core_role,
                restrict_core_ere)
        elif choice == "p":
            print_hypotheses(json_graph, hypothesis_collection, roles_ontology)
def main():
    """Write hypotheses as per-hypothesis text files plus one combined CSV.

    For each hypothesis, writes its CSV-oriented string form to a numbered
    text file, then slices that string into per-ERE rows (padded to a fixed
    column count), prefixes each row with a SIN identifier derived from the
    hypothesis' question IDs, sorts all rows, and writes a single CSV with
    blank lines separating groups.
    """
    parser = ArgumentParser()
    parser.add_argument('graph_path', help='path to the graph JSON file')
    parser.add_argument('hypothesis_path', help='path to the JSON file with hypotheses')
    parser.add_argument('roles_ontology_path', help='path to the roles ontology file')
    parser.add_argument('output_dir', help='directory to write human-readable hypotheses')
    args = parser.parse_args()

    json_graph = JsonGraph.from_dict(
        util.read_json_file(args.graph_path, 'JSON graph'))
    hypotheses_json = util.read_json_file(args.hypothesis_path, 'hypotheses')
    hypothesis_collection = AidaHypothesisCollection.from_json(
        hypotheses_json, json_graph)
    roles_ontology = util.read_json_file(args.roles_ontology_path,
                                         'roles ontology')
    output_dir = util.get_output_dir(args.output_dir, overwrite_warning=True)

    output_list = []
    for hypo_idx, hypothesis in enumerate(hypothesis_collection.hypotheses):
        output_path = output_dir / 'hypothesis-{:0>3d}.txt'.format(hypo_idx)
        result, _ = hypothesis.to_str_for_csv(roles_ontology)
        with open(str(output_path), "w", encoding="utf-8") as fout:
            print(result, file=fout)

        # Escape commas, drop 'ID: ' prefixes, then split: '\n ' separates
        # fields within an ERE block, '\n\n' separates ERE blocks.
        result = result.replace(',', ' &').replace('ID: ', '')
        result_list = result.replace('\n ', ',').split('\n\n')
        for ere_idx, res in enumerate(result_list):
            tmp_res_list = res.split(',')
            if res:
                # If the second field is not a time field (T1..T4), insert an
                # empty time column. NOTE(review): the `[:2] not in 'T1T2T3T4'`
                # test is a substring check, so e.g. '1T' would also pass —
                # confirm this is intended.
                if len(tmp_res_list[1]) < 2 or tmp_res_list[1][:2] not in 'T1T2T3T4':
                    tmp_res_list.insert(1, '')
                # Pad to 9 columns, keeping the last field (the ID) last.
                for _ in range(9 - len(tmp_res_list)):
                    tmp_res_list.insert(-1, '')
                # Blank out fields of the form 'label: ' with empty values.
                for idx, tmp_res in enumerate(tmp_res_list):
                    if len(tmp_res.split(': ')) == 2 and tmp_res.split(': ')[1] == '':
                        tmp_res_list[idx] = ''
                # One CSV row per question ID; the SIN prefix encodes
                # question, hypothesis number, and ERE number.
                for question_ID in hypothesis.questionIDs:
                    question_ID = '_'.join(question_ID.split('_')[3:])
                    sin_info = question_ID + '.{}.{}'.format(
                        hypo_idx + 1, ere_idx + 1)
                    # Numeric sort key: digits of each '_'/'.'-separated part.
                    sin_info_list = sin_info.replace('.', '_').split('_')
                    sin_info_list = tuple([
                        int(''.join([i for i in x if i.isdigit()]))
                        for x in sin_info_list])
                    tmp_res_list2 = copy.deepcopy(tmp_res_list)
                    tmp_res_list2.insert(0, sin_info)
                    res = ','.join(tmp_res_list2)
                    output_list.append((sin_info_list, res))

    output_list.sort(key=lambda x : (x[0][0], x[0][2], x[0][1], x[0][3], x[0][4]))

    # CSV file is named after the hypothesis file, with .json -> .csv.
    csv_output_path = output_dir / args.hypothesis_path.split('/')[-1].replace(
        'json', 'csv')
    with open(csv_output_path, 'w', encoding="utf-8") as csv_file:
        csv_file.write(
            'SIN,Event or Relation type,time,arg1,arg2,arg3,arg4,arg5,comments,ID\n')
        prev = tuple()
        for idx, output in enumerate(output_list):
            # Emit one blank line per sort-key component that changed, so
            # larger group changes get wider visual separation.
            if idx != 0 and prev[0] != output[0][0]:
                csv_file.write('\n')
            if idx != 0 and prev[1] != output[0][1]:
                csv_file.write('\n')
            if idx != 0 and prev[2] != output[0][2]:
                csv_file.write('\n')
            if idx != 0 and prev[3] != output[0][3]:
                csv_file.write('\n')
            csv_file.write(output[1] + '\n')
            prev = output[0]
def main():
    """Emit one SPARQL INSERT DATA update per top hypothesis, attaching
    aida:handle values to prototype nodes."""
    arg_parser = ArgumentParser()
    arg_parser.add_argument('graph_path', help='path to the graph json file')
    arg_parser.add_argument('hypotheses_path', help='path to the hypotheses json file')
    arg_parser.add_argument('output_dir', help='Directory to write queries')
    arg_parser.add_argument('--top', default=50, type=int,
                            help='number of top hypothesis to output')
    arg_parser.add_argument(
        '-f', '--force', action='store_true', default=False,
        help='If specified, overwrite existing output files without warning')
    args = arg_parser.parse_args()

    graph = JsonGraph.from_dict(
        util.read_json_file(args.graph_path, 'JSON graph'))
    cluster_mappings = graph.build_cluster_member_mappings()

    hypotheses_json = util.read_json_file(args.hypotheses_path, 'hypotheses')
    target_dir = util.get_output_dir(args.output_dir,
                                     overwrite_warning=not args.force)

    # Rank hypotheses by probability, highest first.
    ranked = sorted(enumerate(hypotheses_json['probs']),
                    key=itemgetter(1), reverse=True)

    for top_count, (result_idx, _prob) in enumerate(ranked, start=1):
        hypothesis = hypotheses_json['support'][result_idx]

        prototype_handles = compute_handle_mapping(
            graph, hypothesis,
            member_to_clusters=cluster_mappings['member_to_clusters'],
            cluster_to_prototype=cluster_mappings['cluster_to_prototype'])

        # Assemble the update: one aida:handle triple per prototype, with
        # surrounding double quotes stripped from the handle text.
        pieces = [update_prefix + 'INSERT DATA\n{\n']
        for prototype, handle in prototype_handles.items():
            pieces.append(' <{}> aida:handle "{}" .\n'.format(
                prototype, handle.strip('"')))
        pieces.append('}')
        update_str = ''.join(pieces)

        target_path = target_dir / 'hypothesis-{:0>3d}-update.rq'.format(top_count)
        with open(str(target_path), 'w') as fout:
            fout.write(update_str)

        if top_count >= args.top:
            break
def shortest_name(ere_label: str, json_graph: JsonGraph):
    """Return the shortest English name of the given ERE, or None if it has none.

    Ties are broken by the order english_names() yields the names (same
    tie-breaking as the previous stable-sort implementation).
    """
    names = json_graph.english_names(json_graph.ere_names(ere_label))
    if names:
        # min with key=len is O(n), vs O(n log n) for sorting just to take
        # the first element; first minimal element wins, as before.
        return min(names, key=len)
    return None
def main():
    """Build raw hypothesis seeds for every query file against a JSON graph.

    The graph is loaded once; each query found at query_path is processed with
    make_cluster_seeds() and the result is written to '<prefix>_seeds.json'
    in output_dir.
    """
    parser = ArgumentParser()
    parser.add_argument('graph_path', help='Path to the input graph JSON file')
    parser.add_argument(
        'query_path',
        help=
        'Path to the input query file, or a directory with multiple queries')
    parser.add_argument('output_dir',
                        help='Directory to write the raw hypothesis seeds')
    parser.add_argument(
        '-n', '--max_num_seeds_per_facet', type=int, default=None,
        help='If provided, only save up to <arg> seeds per facet')
    parser.add_argument(
        '-d', '--discard_failed_core_constraints', action='store_true',
        help='If specified, discard hypotheses with failed core constraints. '
        'Try not to use this one during evaluation at first, so that we '
        'do not discard hypotheses we might still need. If we have too many '
        'hypotheses and the script runs too slowly, then use this.')
    parser.add_argument(
        '-r', '--rank_cutoff', type=int, default=100,
        help=
        'If specified, discard hypotheses early if there are at least <arg> '
        'other hypotheses that have the same fillers for a certain number '
        '(default = 3) of their non-entrypoint query variables. We might '
        'need this in the evaluation if some facets have many variables '
        'that lead to combinatorial explosion.')
    parser.add_argument(
        '--frame_grouping', action='store_true',
        help=
        'If specified, group query constraints by frames instead of by facets')
    parser.add_argument(
        '-f', '--force', action='store_true',
        help='If specified, overwrite existing output files without warning')
    args = parser.parse_args()

    json_graph = JsonGraph.from_dict(
        util.read_json_file(args.graph_path, 'JSON graph'))

    query_files = util.get_file_list(args.query_path, suffix='.json',
                                     sort=True)
    out_dir = util.get_output_dir(args.output_dir,
                                  overwrite_warning=not args.force)

    for query_file in query_files:
        query_json = util.read_json_file(query_file, 'query')

        raw_seeds_json = make_cluster_seeds(
            json_graph=json_graph,
            query_json=query_json,
            max_num_seeds_per_facet=args.max_num_seeds_per_facet,
            frame_grouping=args.frame_grouping,
            discard_failed_core_constraints=args.
            discard_failed_core_constraints,
            rank_cutoff=args.rank_cutoff)

        # Output name keeps the query file's prefix (text before the first '_').
        seeds_path = out_dir / (query_file.name.split('_')[0] + '_seeds.json')
        logging.info(
            f'Writing raw hypothesis seeds of each facet to {seeds_path} ...')
        with open(str(seeds_path), 'w') as fout:
            json.dump(raw_seeds_json, fout, indent=1)
def main():
    """Re-rank raw hypothesis seeds and export them as hypothesis collections.

    Optionally applies a plausibility model (when both the model path and the
    indexer path are given), then selects seeds by novelty and writes one
    '<prefix>_seeds.json' file per input seeds file.
    """
    parser = ArgumentParser()
    parser.add_argument('graph_path', help='Path to the input graph JSON file')
    parser.add_argument(
        'raw_seeds_path',
        help='Path to the raw hypothesis seeds file, or a directory with '
        'multiple seeds files')
    parser.add_argument(
        'output_dir',
        help='Directory to write the reranked hypothesis seeds')
    parser.add_argument('--plausibility_model_path',
                        help='Path to a hypothesis plausibility model')
    parser.add_argument('--indexer_path', help="Path to the indexers file")
    parser.add_argument('-n', '--max_num_seeds', type=int, default=None,
                        help='Only output up to n hypothesis seeds')
    parser.add_argument(
        '-f', '--force', action='store_true', default=False,
        help='If specified, overwrite existing output files without warning')
    args = parser.parse_args()

    json_graph = JsonGraph.from_dict(
        util.read_json_file(args.graph_path, 'JSON graph'))

    seeds_files = util.get_file_list(args.raw_seeds_path, suffix='.json',
                                     sort=True)
    out_dir = util.get_output_dir(args.output_dir,
                                  overwrite_warning=not args.force)

    for seeds_file in seeds_files:
        raw_seeds_json = util.read_json_file(seeds_file, 'seeds by facet')

        # Deserialize each facet's seeds; the 'graph' entry is metadata,
        # not a facet, so it is skipped here and copied over below.
        seeds_by_facet = {
            facet_label: [
                HypothesisSeed.from_json(seed_json, json_graph)
                for seed_json in facet_seeds
            ]
            for facet_label, facet_seeds in raw_seeds_json.items()
            if facet_label != 'graph'
        }

        # Plausibility re-ranking needs both the model and the indexers file.
        if args.plausibility_model_path is not None and args.indexer_path is not None:
            seeds_by_facet = rerank_seeds_by_plausibility(
                seeds_by_facet, args.graph_path,
                args.plausibility_model_path, args.indexer_path)

        seeds = select_seeds_by_novelty(seeds_by_facet, args.max_num_seeds)

        # Turn ranks into log weights of the seed hypotheses. The numbers are
        # meaningless; we just assign log(1/1), log(1/2), log(1/3), ...
        hypotheses_to_export = []
        for rank, seed in enumerate(seeds):
            seed.hypothesis.update_weight(math.log(1.0 / (rank + 1)))
            hypotheses_to_export.append(seed.finalize())

        collection = AidaHypothesisCollection(hypotheses_to_export)
        out_json = collection.to_json()
        out_json['graph'] = raw_seeds_json['graph']

        out_path = out_dir / (seeds_file.name.split('_')[0] + '_seeds.json')
        logging.info(
            f'Writing re-ranked hypothesis seeds to {out_path} ...')
        with open(str(out_path), 'w') as fout:
            json.dump(out_json, fout, indent=1)
def main():
    """Convert a TA2 KB to a JSON graph and, optionally, SoINs to JSON queries.

    The KB is always transformed and written to graph_output_path. When a SoIN
    path is provided, each SoIN file is parsed, its entry points are resolved
    against the KB, and one '<stem>_query.json' file is written per SoIN.
    """
    parser = ArgumentParser(
        description=
        'Read in a TA2 KB and a (list of) XML-based Statement of Information Need '
        'definition, convert the KB to JSON format, then convert each SoIN to a JSON '
        'query by identifying and ranking entry points.')
    parser.add_argument('kb_path', help='Path to the input TA2 KB')
    parser.add_argument('graph_output_path',
                        help='Path to write the JSON graph')
    parser.add_argument(
        '-s', '--soin_path',
        help=
        'Path to the input SoIN file, or a directory containing multiple SoIN '
        'files; if not provided, will only transform the graph')
    parser.add_argument(
        '-q', '--query_output_dir',
        help=
        'Directory to write the JSON queries, used when soin_path is provided')
    parser.add_argument(
        '-m', '--max_matches', type=int, default=50,
        help='The maximum number of EPs *per entry point description*')
    parser.add_argument(
        '-d', '--dup_kb', default=duplicate_kb_file,
        help='Path to the json file with duplicate KB ID mappings')
    parser.add_argument(
        '-f', '--force', action='store_true', default=False,
        help='If specified, overwrite existing output files without warning')
    args = parser.parse_args()

    kb_path = util.get_input_path(args.kb_path)
    graph_output_path = util.get_output_path(args.graph_output_path,
                                             overwrite_warning=not args.force)

    # Build the RDF graph from the Turtle KB, then derive the JSON graph.
    aida_graph = AidaGraph()
    aida_graph.build_graph(str(kb_path), fmt='ttl')

    json_graph = JsonGraph()
    json_graph.build_graph(aida_graph)

    logging.info('Writing JSON graph to {} ...'.format(graph_output_path))
    with open(str(graph_output_path), 'w') as fout:
        json.dump(json_graph.as_dict(), fout, indent=1)
    logging.info('Done.')

    if args.soin_path is not None:
        # A bare assert would be stripped under `python -O`; report the missing
        # option through argparse so it is always enforced and exits cleanly.
        if args.query_output_dir is None:
            parser.error('Must provide query_output_dir when soin_path is provided')

        soin_path = util.get_input_path(args.soin_path)
        query_output_dir = util.get_output_dir(
            args.query_output_dir, overwrite_warning=not args.force)

        soin_file_paths = util.get_file_list(soin_path, suffix='.xml',
                                             sort=True)

        dup_kb_id_mapping = None
        if args.dup_kb is not None:
            dup_kb_id_mapping = util.read_json_file(
                args.dup_kb, 'duplicate KB ID mapping')

        logging.info('Getting Cluster Mappings ...')
        ere_to_prototypes = get_cluster_mappings(aida_graph)

        for soin_file_path in soin_file_paths:
            query_output_path = query_output_dir / (soin_file_path.stem +
                                                    '_query.json')

            logging.info('Processing SOIN {} ...'.format(soin_file_path))
            soin = SOIN.parse(str(soin_file_path),
                              dup_kbid_mapping=dup_kb_id_mapping)

            logging.info('Resolving all entrypoints ...')
            soin.resolve(aida_graph, ere_to_prototypes,
                         max_matches=args.max_matches)

            # The query records which KB it was built from via the file stem.
            query_json = {'graph': kb_path.stem}
            query_json.update(soin.to_json())

            logging.info(
                'Writing JSON query to {} ...'.format(query_output_path))
            with open(str(query_output_path), 'w') as fout:
                json.dump(query_json, fout, indent=1)