def main():
    """Filter hypotheses from one or more JSON hypothesis files.

    Reads the JSON graph and the raw hypotheses (a single file or every
    ``*.json`` file in a directory), drops hypotheses that are too short,
    runs the remaining ones through AidaHypothesisFilter, compactifies the
    result, and writes one filtered file per input file into ``output_dir``.
    """
    parser = ArgumentParser()
    parser.add_argument('graph_path', help='Path to the input graph JSON file')
    parser.add_argument(
        'hypotheses_path',
        help='Path to the raw hypotheses file, or a directory with multiple files')
    # NOTE: fixed a typo in the help text below ('files(s)' -> 'file(s)').
    parser.add_argument(
        'output_dir',
        help='Directory to write the filtered hypothesis file(s)')
    parser.add_argument(
        '-f', '--force', action='store_true', default=False,
        help='If specified, overwrite existing output files without warning')
    args = parser.parse_args()

    # Resolve the output directory first so any overwrite warning fires before
    # we do the expensive graph loading.
    output_dir = util.get_output_dir(args.output_dir,
                                     overwrite_warning=not args.force)

    json_graph = JsonGraph.from_dict(
        util.read_json_file(args.graph_path, 'JSON graph'))

    hypotheses_file_paths = util.get_file_list(args.hypotheses_path,
                                               suffix='.json', sort=True)

    for hypotheses_file_path in hypotheses_file_paths:
        hypotheses_json = util.read_json_file(hypotheses_file_path, 'hypotheses')
        hypothesis_collection = AidaHypothesisCollection.from_json(
            hypotheses_json, json_graph)
        hypothesis_collection.expand()

        # Create the filter.
        hypothesis_filter = AidaHypothesisFilter(json_graph)

        # Skip hypotheses that are too short, filter the rest.
        filtered_hyplist = [
            hypothesis_filter.filtered(hypothesis)
            for hypothesis in hypothesis_collection
            if not hypothesis_too_short(hypothesis, json_graph)]
        filtered_hypothesis_collection = AidaHypothesisCollection(
            compactify(filtered_hyplist, json_graph))

        filtered_hypotheses_json = filtered_hypothesis_collection.to_json()

        # Carry over graph filename and queries, if they were there before.
        if 'graph' in hypotheses_json:
            filtered_hypotheses_json['graph'] = hypotheses_json['graph']
        if "queries" in hypotheses_json:
            filtered_hypotheses_json['queries'] = hypotheses_json['queries']

        # Output file keeps the same name as the input file.
        output_path = output_dir / hypotheses_file_path.name
        logging.info('Writing filtered hypotheses to {} ...'.format(output_path))
        with open(str(output_path), 'w') as fout:
            json.dump(filtered_hypotheses_json, fout, indent=1)
def main():
    """Re-rank raw hypothesis seeds and export them as hypothesis collections.

    Optionally re-ranks by a plausibility model (when both model and indexer
    paths are given), selects seeds by novelty, converts ranks into log
    weights, and writes one ``*_seeds.json`` file per input file.
    """
    parser = ArgumentParser()
    parser.add_argument('graph_path', help='Path to the input graph JSON file')
    parser.add_argument(
        'raw_seeds_path',
        help='Path to the raw hypothesis seeds file, or a directory with '
             'multiple seeds files')
    parser.add_argument(
        'output_dir', help='Directory to write the reranked hypothesis seeds')
    parser.add_argument('--plausibility_model_path',
                        help='Path to a hypothesis plausibility model')
    parser.add_argument('--indexer_path', help="Path to the indexers file")
    parser.add_argument('-n', '--max_num_seeds', type=int, default=None,
                        help='Only output up to n hypothesis seeds')
    parser.add_argument(
        '-f', '--force', action='store_true', default=False,
        help='If specified, overwrite existing output files without warning')
    args = parser.parse_args()

    json_graph = JsonGraph.from_dict(
        util.read_json_file(args.graph_path, 'JSON graph'))

    seed_file_paths = util.get_file_list(args.raw_seeds_path,
                                         suffix='.json', sort=True)
    output_dir = util.get_output_dir(args.output_dir,
                                     overwrite_warning=not args.force)

    for seed_file_path in seed_file_paths:
        raw_seeds_json = util.read_json_file(seed_file_path, 'seeds by facet')

        # Deserialize the seeds of each facet; the 'graph' entry is metadata
        # rather than a facet, so it is skipped here and re-attached on export.
        seeds_by_facet = {
            facet_label: [HypothesisSeed.from_json(entry, json_graph)
                          for entry in facet_entries]
            for facet_label, facet_entries in raw_seeds_json.items()
            if facet_label != 'graph'
        }

        have_plausibility = (args.plausibility_model_path is not None
                             and args.indexer_path is not None)
        if have_plausibility:
            seeds_by_facet = rerank_seeds_by_plausibility(
                seeds_by_facet, args.graph_path,
                args.plausibility_model_path, args.indexer_path)

        seeds = select_seeds_by_novelty(seeds_by_facet, args.max_num_seeds)

        # Turn ranks into log weights of the seed hypotheses. The numbers are
        # meaningless beyond their ordering: log(1/1), log(1/2), log(1/3), ...
        hypotheses_to_export = []
        for position, seed in enumerate(seeds, start=1):
            seed.hypothesis.update_weight(math.log(1.0 / position))
            hypotheses_to_export.append(seed.finalize())

        collection = AidaHypothesisCollection(hypotheses_to_export)
        export_json = collection.to_json()
        export_json['graph'] = raw_seeds_json['graph']

        # Output name is derived from the prefix of the input file name.
        file_stem = seed_file_path.name.split('_')[0]
        output_path = output_dir / (file_stem + '_seeds.json')
        logging.info(
            'Writing re-ranked hypothesis seeds to {} ...'.format(output_path))
        with open(str(output_path), 'w') as fout:
            json.dump(export_json, fout, indent=1)
def main():
    """Build raw hypothesis seeds for each query and write them to disk.

    For every query JSON file found under ``query_path``, calls
    ``make_cluster_seeds`` with the CLI-configured limits and writes the
    resulting seeds to ``output_dir`` as ``<prefix>_seeds.json``.
    """
    parser = ArgumentParser()
    parser.add_argument('graph_path', help='Path to the input graph JSON file')
    parser.add_argument(
        'query_path',
        help='Path to the input query file, or a directory with multiple queries')
    parser.add_argument('output_dir',
                        help='Directory to write the raw hypothesis seeds')
    parser.add_argument(
        '-n', '--max_num_seeds_per_facet', type=int, default=None,
        help='If provided, only save up to <arg> seeds per facet')
    parser.add_argument(
        '-d', '--discard_failed_core_constraints', action='store_true',
        help='If specified, discard hypotheses with failed core constraints. '
             'Try not to use this one during evaluation at first, so that we '
             'do not discard hypotheses we might still need. If we have too many '
             'hypotheses and the script runs too slowly, then use this.')
    parser.add_argument(
        '-r', '--rank_cutoff', type=int, default=100,
        help='If specified, discard hypotheses early if there are at least <arg> '
             'other hypotheses that have the same fillers for a certain number '
             '(default = 3) of their non-entrypoint query variables. We might '
             'need this in the evaluation if some facets have many variables '
             'that lead to combinatorial explosion.')
    parser.add_argument(
        '--frame_grouping', action='store_true',
        help='If specified, group query constraints by frames instead of by facets')
    parser.add_argument(
        '-f', '--force', action='store_true',
        help='If specified, overwrite existing output files without warning')
    args = parser.parse_args()

    json_graph = JsonGraph.from_dict(
        util.read_json_file(args.graph_path, 'JSON graph'))

    query_paths = util.get_file_list(args.query_path, suffix='.json', sort=True)
    output_dir = util.get_output_dir(args.output_dir,
                                     overwrite_warning=not args.force)

    for query_path in query_paths:
        query_json = util.read_json_file(query_path, 'query')
        seeds_json = make_cluster_seeds(
            json_graph=json_graph,
            query_json=query_json,
            max_num_seeds_per_facet=args.max_num_seeds_per_facet,
            frame_grouping=args.frame_grouping,
            discard_failed_core_constraints=args.discard_failed_core_constraints,
            rank_cutoff=args.rank_cutoff)

        # Write the hypothesis seeds out in JSON format; the output name is
        # derived from the prefix of the query file name.
        seed_file_name = query_path.name.split('_')[0] + '_seeds.json'
        output_path = output_dir / seed_file_name
        logging.info(
            'Writing raw hypothesis seeds of each facet to {} ...'.format(
                output_path))
        with open(str(output_path), 'w') as fout:
            json.dump(seeds_json, fout, indent=1)
def main():
    """Recover coref-compressed hypotheses back to the original (uncompressed) graph.

    Maps each statement of every compressed hypothesis back to the original
    statements via the coref-compression log, rebuilds statement weights and
    cluster memberships, and writes one recovered file per input file.
    """
    parser = ArgumentParser()
    parser.add_argument(
        'hypotheses_path',
        help='path to the input json file for hypotheses, or a directory with '
             'a list of hypotheses files')
    parser.add_argument(
        'output_dir',
        help='directory to write the coref-recovered hypotheses')
    parser.add_argument('original_graph_path',
                        help='path to the original graph json file')
    parser.add_argument('compressed_graph_path',
                        help='path to the compressed graph json file')
    parser.add_argument('input_log_path',
                        help='path to log file from coref compression')
    parser.add_argument(
        '-f', '--force', action='store_true', default=False,
        help='If specified, overwrite existing output files without warning')
    args = parser.parse_args()

    hypotheses_file_paths = util.get_file_list(args.hypotheses_path,
                                               suffix='.json', sort=True)
    output_dir = util.get_output_dir(args.output_dir,
                                     overwrite_warning=not args.force)

    original_graph_json = util.read_json_file(args.original_graph_path,
                                              'original JSON graph')
    compressed_graph_json = util.read_json_file(args.compressed_graph_path,
                                                'compressed JSON graph')
    # Log mapping produced by coref compression: prototype -> clusters and
    # compressed statement -> original statements.
    input_log_json = util.read_json_file(args.input_log_path, 'coref log')

    for hypotheses_file_path in hypotheses_file_paths:
        input_hypotheses_json = util.read_json_file(hypotheses_file_path,
                                                    'hypotheses')
        # probs do not change
        output_hypotheses_json = {
            'probs': input_hypotheses_json['probs'],
            'support': []
        }

        for compressed_hypothesis in input_hypotheses_json["support"]:
            original_hypothesis = {'statements': [], 'statementWeights': []}

            # The mapping from each original statement (before coref-compression)
            # to its weight
            original_stmt_weight_mapping = {}

            # Set of cluster membership nodes to include in the original hypothesis
            cluster_membership_set = set()

            for compressed_stmt, stmt_weight in zip(
                    compressed_hypothesis['statements'],
                    compressed_hypothesis['statementWeights']):
                # Get the statement entry from the compressed graph
                compressed_stmt_entry = compressed_graph_json['theGraph'][
                    compressed_stmt]

                # Get the cluster(s) from the subject of the compressed statement
                stmt_subj_clusters = \
                    input_log_json['prototype_to_clusters'][compressed_stmt_entry['subject']]

                # Whether this is a type statement
                is_type_stmt = (compressed_stmt_entry['predicate'] == 'type')

                # Get the cluster(s) from the object of the compressed statement
                # if it is an edge statement (type statements have a type label,
                # not an ERE, as object, so there is nothing to resolve).
                if is_type_stmt:
                    stmt_obj_clusters = None
                else:
                    stmt_obj_clusters = \
                        input_log_json['prototype_to_clusters'][compressed_stmt_entry['object']]

                for original_stmt in input_log_json['new_stmt_to_old_stmts'][
                        compressed_stmt]:
                    # Resolve the statements and weights before coref-compression;
                    # when several compressed statements map to the same original
                    # statement, keep the highest weight.
                    if original_stmt not in original_stmt_weight_mapping:
                        original_stmt_weight_mapping[original_stmt] = stmt_weight
                    elif original_stmt_weight_mapping[original_stmt] < stmt_weight:
                        original_stmt_weight_mapping[original_stmt] = stmt_weight

                    # Get the statement entry from the original graph
                    original_stmt_entry = original_graph_json['theGraph'][
                        original_stmt]

                    # Add cluster membership between the original subject and
                    # each subject cluster
                    stmt_subj = original_stmt_entry['subject']
                    for stmt_subj_cluster in stmt_subj_clusters:
                        cluster_membership_set.add((stmt_subj, stmt_subj_cluster))

                    # Sanity check: type-ness must agree between the compressed
                    # statement and every original statement it maps to.
                    if is_type_stmt:
                        assert original_stmt_entry['predicate'] == 'type'
                    else:
                        assert original_stmt_entry['predicate'] != 'type'
                        # Add cluster membership between the original object and
                        # each object cluster
                        stmt_obj = original_stmt_entry['object']
                        for stmt_obj_cluster in stmt_obj_clusters:
                            cluster_membership_set.add((stmt_obj, stmt_obj_cluster))

            # Flatten the statement->weight mapping into the parallel lists the
            # hypothesis JSON format expects.
            for original_stmt, stmt_weight in original_stmt_weight_mapping.items():
                original_hypothesis['statements'].append(original_stmt)
                original_hypothesis['statementWeights'].append(stmt_weight)

            original_hypothesis['clusterMemberships'] = list(
                cluster_membership_set)
            original_hypothesis['failedQueries'] = compressed_hypothesis[
                'failedQueries']

            # Query statements are also mapped back to their originals.
            original_query_stmts = set()
            for compressed_query_stmt in compressed_hypothesis['queryStatements']:
                original_query_stmts.update(
                    input_log_json['new_stmt_to_old_stmts'][compressed_query_stmt])
            original_hypothesis['queryStatements'] = list(original_query_stmts)

            output_hypotheses_json['support'].append(original_hypothesis)

        # Carry over graph filename and queries, if they were there before.
        if 'graph' in input_hypotheses_json:
            output_hypotheses_json['graph'] = input_hypotheses_json['graph']
        if 'queries' in input_hypotheses_json:
            output_hypotheses_json['queries'] = input_hypotheses_json['queries']

        output_path = util.get_output_path(output_dir / hypotheses_file_path.name,
                                           overwrite_warning=not args.force)
        print('Writing coref-recovered hypotheses to {}'.format(output_path))
        with open(str(output_path), 'w') as fout:
            json.dump(output_hypotheses_json, fout, indent=2)
def main():
    """Export the top hypotheses of each hypotheses file as AIF (Turtle) subgraphs.

    Loads the JSON graph and the TA2 KB, indexes the KB nodes, then for each
    hypotheses file writes up to ``--top`` hypothesis subgraphs (ordered by
    descending probability) as ``.ttl`` files named after the run/SIN/frame IDs.
    """
    parser = ArgumentParser()
    parser.add_argument('graph_path', help='path to the graph json file')
    parser.add_argument('hypotheses_path',
                        help='path to the hypotheses json directory')
    parser.add_argument('kb_path', help='path to the TA2 KB file (in AIF)')
    parser.add_argument('output_dir', help='path to output directory')
    parser.add_argument('run_id', help='TA3 run ID')
    parser.add_argument('sin_id_prefix',
                        help='prefix of SIN IDs to name the final hypotheses')
    parser.add_argument('--top', default=50, type=int,
                        help='number of top hypothesis to output')
    parser.add_argument(
        '-f', '--force', action='store_true', default=False,
        help='If specified, overwrite existing output files without warning')
    args = parser.parse_args()

    json_graph = JsonGraph.from_dict(
        util.read_json_file(args.graph_path, 'JSON graph'))
    graph_mappings = json_graph.build_cluster_member_mappings()

    hypotheses_file_paths = util.get_file_list(args.hypotheses_path,
                                               suffix='.json', sort=True)

    # TODO: there is a known bug in rdflib that
    #  rdflib.Literal("2008", datatype=rdflib.XSD.gYear) would be parsed into
    #  rdflib.term.Literal(u'2008-01-01', datatype=rdflib.XSD.gYear) automatically,
    #  because a `parse_date` function is invoked for all rdflib.XSD.gYear literals.
    #  This is a temporary workaround to patch the _toPythonMapping locally.
    #  c.f.: https://github.com/RDFLib/rdflib/issues/806
    # noinspection PyProtectedMember
    rdflib.term._toPythonMapping.pop(rdflib.XSD['gYear'])

    print('Reading kb from {}'.format(args.kb_path))
    kb_graph = Graph()
    kb_graph.parse(args.kb_path, format='ttl')

    # Pre-index the KB so per-hypothesis subgraph construction can look up
    # statements, cluster memberships, and type statements by key.
    kb_nodes_by_category = catalogue_kb_nodes(kb_graph)
    kb_stmt_key_mapping = index_statement_nodes(
        kb_graph, kb_nodes_by_category['Statement'])
    kb_cm_key_mapping = index_cluster_membership_nodes(
        kb_graph, kb_nodes_by_category['ClusterMembership'])
    kb_type_stmt_key_mapping = index_type_statement_nodes(
        kb_graph, kb_nodes_by_category['TypeStatement'])

    output_dir = util.get_output_dir(args.output_dir,
                                     overwrite_warning=not args.force)

    run_id = args.run_id
    sin_id_prefix = args.sin_id_prefix

    for hypotheses_file_path in hypotheses_file_paths:
        hypotheses_json = util.read_json_file(hypotheses_file_path, 'hypotheses')
        print('Found {} hypotheses with probability {}'.format(
            len(hypotheses_json['probs']), hypotheses_json['probs']))

        # SIN id is taken from the prefix of the hypotheses file name.
        soin_id = sin_id_prefix + '_' + hypotheses_file_path.stem.split('_')[0]
        frame_id = soin_id + '_F1'

        top_count = 0
        # Iterate hypotheses in order of descending probability.
        for hypothesis_idx, prob in sorted(enumerate(hypotheses_json['probs']),
                                           key=itemgetter(1), reverse=True):
            # Probabilities are expected to be log-probs (<= 0); map them to a
            # positive weight. Positive probs get a small fallback weight.
            if prob <= 0.0:
                hypothesis_weight = math.exp(prob / 2.0)
            else:
                hypothesis_weight = 0.0001

            hypothesis = hypotheses_json['support'][hypothesis_idx]

            top_count += 1
            hypothesis_id = '{}_hypothesis_{:0>3d}'.format(frame_id, top_count)

            subgraph = build_subgraph_for_hypothesis(
                kb_graph=kb_graph,
                kb_nodes_by_category=kb_nodes_by_category,
                kb_stmt_key_mapping=kb_stmt_key_mapping,
                kb_cm_key_mapping=kb_cm_key_mapping,
                kb_type_stmt_key_mapping=kb_type_stmt_key_mapping,
                json_graph=json_graph,
                graph_mappings=graph_mappings,
                hypothesis=hypothesis,
                hypothesis_id=hypothesis_id,
                hypothesis_weight=hypothesis_weight)

            output_path = output_dir / '{}.{}.{}.H{:0>3d}.ttl'.format(
                run_id, soin_id, frame_id, top_count)
            print('Writing hypothesis #{:>2d} with prob {:>6.2f} to {}'.format(
                top_count, prob, output_path))
            with open(output_path, 'w') as fout:
                fout.write(print_graph(subgraph))

            # Stop after exporting the requested number of hypotheses.
            if top_count >= args.top:
                break
def main():
    """Convert a TA2 KB to a JSON graph and, optionally, SoINs to JSON queries.

    Always writes the JSON graph. When ``--soin_path`` is given, each SoIN XML
    file is parsed, its entry points resolved against the graph, and a JSON
    query written into ``--query_output_dir``.
    """
    parser = ArgumentParser(
        description=
        'Read in a TA2 KB and a (list of) XML-based Statement of Information Need '
        'definition, convert the KB to JSON format, then convert each SoIN to a JSON '
        'query by identifying and ranking entry points.')
    parser.add_argument('kb_path', help='Path to the input TA2 KB')
    parser.add_argument('graph_output_path',
                        help='Path to write the JSON graph')
    parser.add_argument(
        '-s', '--soin_path',
        help='Path to the input SoIN file, or a directory containing multiple SoIN '
             'files; if not provided, will only transform the graph')
    parser.add_argument(
        '-q', '--query_output_dir',
        help='Directory to write the JSON queries, used when soin_path is provided')
    parser.add_argument(
        '-m', '--max_matches', type=int, default=50,
        help='The maximum number of EPs *per entry point description*')
    parser.add_argument(
        '-d', '--dup_kb', default=duplicate_kb_file,
        help='Path to the json file with duplicate KB ID mappings')
    parser.add_argument(
        '-f', '--force', action='store_true', default=False,
        help='If specified, overwrite existing output files without warning')
    args = parser.parse_args()

    kb_path = util.get_input_path(args.kb_path)
    graph_output_path = util.get_output_path(args.graph_output_path,
                                             overwrite_warning=not args.force)

    # Build the intermediate AidaGraph from the Turtle KB, then convert it to
    # the JSON graph representation.
    aida_graph = AidaGraph()
    aida_graph.build_graph(str(kb_path), fmt='ttl')

    json_graph = JsonGraph()
    json_graph.build_graph(aida_graph)

    logging.info('Writing JSON graph to {} ...'.format(graph_output_path))
    with open(str(graph_output_path), 'w') as fout:
        json.dump(json_graph.as_dict(), fout, indent=1)
    logging.info('Done.')

    if args.soin_path is not None:
        # Was `assert`, which is stripped under `python -O`; report a proper
        # CLI usage error instead.
        if args.query_output_dir is None:
            parser.error('Must provide query_output_dir when soin_path is provided')

        soin_path = util.get_input_path(args.soin_path)
        query_output_dir = util.get_output_dir(
            args.query_output_dir, overwrite_warning=not args.force)

        soin_file_paths = util.get_file_list(soin_path, suffix='.xml', sort=True)

        dup_kb_id_mapping = None
        if args.dup_kb is not None:
            dup_kb_id_mapping = util.read_json_file(args.dup_kb,
                                                    'duplicate KB ID mapping')

        logging.info('Getting Cluster Mappings ...')
        ere_to_prototypes = get_cluster_mappings(aida_graph)

        for soin_file_path in soin_file_paths:
            query_output_path = query_output_dir / (soin_file_path.stem +
                                                    '_query.json')

            logging.info('Processing SOIN {} ...'.format(soin_file_path))
            soin = SOIN.parse(str(soin_file_path),
                              dup_kbid_mapping=dup_kb_id_mapping)

            logging.info('Resolving all entrypoints ...')
            soin.resolve(aida_graph, ere_to_prototypes,
                         max_matches=args.max_matches)

            # The query records which graph it was built against.
            query_json = {'graph': kb_path.stem}
            query_json.update(soin.to_json())

            logging.info('Writing JSON query to {} ...'.format(query_output_path))
            with open(str(query_output_path), 'w') as fout:
                json.dump(query_json, fout, indent=1)