def main():
    parser = ArgumentParser()
    parser.add_argument('input_path', help='path to the input Excel ontology file')
    parser.add_argument('output_path', help='path to write the JSON ontology file')
    args = parser.parse_args()

    input_path = util.get_input_path(args.input_path)

    # sheet_name=None reads every sheet into a dict of DataFrames keyed by sheet name
    df = pandas.read_excel(str(input_path), sheet_name=None)
    event_records = df['events'].to_dict('records')
    relation_records = df['relations'].to_dict('records')

    roles_ontology = defaultdict(dict)

    # Events can have up to five argument slots; empty cells are read as NaN, so only
    # string-valued labels are kept
    for ev in event_records:
        ev_type = get_type_str(ev)
        for arg_idx in range(1, 6):
            arg_key = f'arg{arg_idx} label'
            if isinstance(ev[arg_key], str):
                roles_ontology[ev_type][f'arg{arg_idx}'] = ev[arg_key]

    # Relations always have exactly two arguments
    for rel in relation_records:
        rel_type = get_type_str(rel)
        roles_ontology[rel_type]['arg1'] = rel['arg1 label']
        roles_ontology[rel_type]['arg2'] = rel['arg2 label']

    output_path = util.get_output_path(args.output_path)
    with open(str(output_path), 'w') as fout:
        json.dump(roles_ontology, fout, indent=2)

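# For reference, the roles ontology written above maps each event or relation type to its
# argument labels. An illustrative excerpt of the output JSON (the type and label names here
# are hypothetical placeholders, not taken from the actual ontology file):
#
# {
#   "Conflict.Attack": {
#     "arg1": "Attacker",
#     "arg2": "Target",
#     "arg3": "Instrument"
#   },
#   "GeneralAffiliation.Sponsorship": {
#     "arg1": "Sponsor",
#     "arg2": "Entity"
#   }
# }
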
def main(): parser = argparse.ArgumentParser() parser.add_argument("working_dir", help='path to the working directory.') parser.add_argument( '--subgraph_dir', default='subgraph', help='name of the subdirectory in working_dir containing subgraphs') parser.add_argument( '--seed_dir', default='cluster_seeds', help='name of the subdirectory in working_dir containing cluster seeds' ) parser.add_argument( '--indexed_data_dir', default='data_indexed', help='name of the subdirectory in working_dir to write indexed data') parser.add_argument("--indexer_path", default='resources/indexer.p', help='path to the indexer file.') parser.add_argument( '-f', '--force', action='store_true', default=False, help='If specified, overwrite existing output files without warning') args = parser.parse_args() working_dir = util.get_input_path(args.working_dir) subgraph_dir = util.get_input_path(working_dir / args.subgraph_dir) seed_dir = util.get_input_path(working_dir / args.seed_dir) output_dir = util.get_output_dir(working_dir / args.indexed_data_dir, overwrite_warning=not args.force) indexer_path = str(util.get_input_path(args.indexer_path)) locals().update(vars(args)) seed_subgraph_map = map_subgraph_to_seed(subgraph_dir, seed_dir) print('\nIndexing files ...') index_and_partition(seed_subgraph_map, output_dir, indexer_path) print(f'\nIndexing finished: indexed data in directory {output_dir}')
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('input_path', help='Path to a TA2 KB or a JSON graph')
    args = parser.parse_args()

    input_path = util.get_input_path(args.input_path)

    if input_path.suffix == '.ttl':
        aida_graph = AidaGraph()
        aida_graph.build_graph(str(input_path), fmt='ttl')
        get_kb_stats(aida_graph)
    elif input_path.suffix == '.json':
        with open(input_path, 'r') as fin:
            json_graph = JsonGraph.from_dict(json.load(fin))
        get_json_stats(json_graph)

def main():
    parser = ArgumentParser()
    parser.add_argument('graph_path', help='path to the graph json file')
    parser.add_argument('hypotheses_path', help='path to the hypotheses json file')
    parser.add_argument('db_dir', help='directory with copies of tdb databases')
    parser.add_argument('output_dir', help='path to output directory')
    parser.add_argument('--top', default=50, type=int,
                        help='number of top hypotheses to output')
    parser.add_argument('--dry_run', action='store_true',
                        help='if specified, only write the SPARQL queries to files, '
                             'without actually executing the queries')
    parser.add_argument('--query_just', action='store_true')
    parser.add_argument('--query_conf', action='store_true')
    parser.add_argument('-f', '--force', action='store_true', default=False,
                        help='If specified, overwrite existing output files without warning')
    args = parser.parse_args()

    json_graph = JsonGraph.from_dict(util.read_json_file(args.graph_path, 'JSON graph'))

    # Mappings between cluster members, clusters, and cluster prototypes
    mappings = json_graph.build_cluster_member_mappings()
    member_to_clusters = mappings['member_to_clusters']
    cluster_to_prototype = mappings['cluster_to_prototype']
    prototype_set = set(mappings['prototype_to_clusters'].keys())

    hypotheses_json = util.read_json_file(args.hypotheses_path, 'hypotheses')

    output_dir = util.get_output_dir(args.output_dir, overwrite_warning=not args.force)

    db_dir = util.get_input_path(args.db_dir)
    db_path_list = [str(path) for path in sorted(db_dir.glob('copy*'))]
    print('Using the following tdb databases to query: {}'.format(db_path_list))
    num_node_queries = len(db_path_list)

    top_count = 0
    # Process hypotheses in decreasing order of probability
    for result_idx, prob in sorted(
            enumerate(hypotheses_json['probs']), key=itemgetter(1), reverse=True):
        hypothesis = hypotheses_json['support'][result_idx]

        # node_query_list, stmt_query_list, just_query_list, conf_query_list = \
        sparql_query_str = \
            queries_for_aida_result(
                json_graph=json_graph,
                hypothesis=hypothesis,
                member_to_clusters=member_to_clusters,
                cluster_to_prototype=cluster_to_prototype,
                prototype_set=prototype_set,
                num_node_queries=num_node_queries,
                query_just=args.query_just,
                query_conf=args.query_conf)

        top_count += 1
        print(f'Writing queries for hypothesis #{top_count} with prob {prob}')

        sparql_query_path = output_dir / 'hypothesis-{:0>3d}-query.rq'.format(top_count)
        with open(str(sparql_query_path), 'w') as fout:
            fout.write(sparql_query_str + '\n')

        if not args.dry_run:
            query_result_path = output_dir / 'hypothesis-{:0>3d}-raw.ttl'.format(top_count)
            query_cmd = 'echo "query {0}"; tdbquery --loc {1} --query {0} > {2}; '.format(
                sparql_query_path, db_path_list[0], query_result_path)
            print('Executing queries ...')
            process = subprocess.Popen(query_cmd, shell=True)
            process.wait()

        # sparql_helper.execute_sparql_queries(
        #     node_query_list, stmt_query_list, just_query_list, conf_query_list,
        #     db_path_list, output_dir,
        #     filename_prefix='hypothesis-{:0>3d}'.format(top_count),
        #     header_prefixes=AIF_HEADER_PREFIXES, dry_run=args.dry_run)

        if top_count >= args.top:
            break

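# For reference, each non-dry run above shells out to Apache Jena's tdbquery. Assuming a
# hypothetical output directory output/ and a tdb database copy at db/copy_0 (the paths are
# placeholders, not from the source), the assembled command looks roughly like:
#
#   echo "query output/hypothesis-001-query.rq"; \
#       tdbquery --loc db/copy_0 --query output/hypothesis-001-query.rq \
#       > output/hypothesis-001-raw.ttl;
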
def main():
    parser = argparse.ArgumentParser()
    # For qualitative evaluation only
    parser.add_argument('working_dir', help='path to the working directory')
    parser.add_argument('--seed_dir', default='cluster_seeds',
                        help='name of the subdirectory in working_dir containing cluster seeds')
    parser.add_argument('--indexed_data_dir', default='data_indexed',
                        help='name of the subdirectory in working_dir containing indexed data')
    parser.add_argument('--output_dir', default='result_jsons',
                        help='name of the subdirectory in working_dir to write output hypotheses')
    parser.add_argument('--indexer_path', default='resources/indexers.p',
                        help='path to the indexer file')
    parser.add_argument('--model_path', default='resources/gcn2-cuda_best_15000_0.ckpt',
                        help='path to the pre-trained model checkpoint')
    parser.add_argument('--device', type=int, default=-1)
    parser.add_argument('--attention_type', type=str, default='concat')
    parser.add_argument('--num_layers', type=int, default=2)
    parser.add_argument('--hidden_size', type=int, default=300)
    parser.add_argument('--attention_size', type=int, default=300)
    parser.add_argument('--conv_dropout', type=float, default=.5)
    parser.add_argument('--attention_dropout', type=float, default=.3)
    parser.add_argument('-f', '--force', action='store_true', default=False,
                        help='If specified, overwrite existing output files without warning')
    # parser.add_argument("--use_highest_ranked_gold", action='store_true')
    # parser.add_argument("--force", action='store_true')
    # parser.add_argument("--init_prob_force", type=float, default=.95)
    # parser.add_argument("--force_decay", type=float, default=.9684)
    # parser.add_argument("--force_every", type=int, default=1000)
    # parser.add_argument("--batch_size", type=int, default=4)
    # parser.add_argument("--self_attention_type", type=str, default='concat')
    # parser.add_argument("--num_epochs", type=int, default=2)
    # parser.add_argument("--learning_rate", type=float, default=1e-5)
    # parser.add_argument("--save_path", type=str, default="Saved_Models")
    # parser.add_argument("--save_tag", type=str, default="")
    # parser.add_argument("--eval_tag", type=str, default="")
    # parser.add_argument("--valid_every", type=int, default=5000)
    # parser.add_argument("--print_every", type=int, default=100)
    args = parser.parse_args()

    # Run on CPU when device is -1, otherwise on the specified CUDA device
    if args.device == -1:
        device = torch.device('cpu')
    else:
        device = torch.device(f'cuda:{args.device}')

    print(f'\nConfig:{args}')

    torch.manual_seed(0)
    np.random.seed(0)

    # Load the indexers and embedding matrices for EREs and statements
    indexer_path = str(util.get_input_path(args.indexer_path))
    with open(indexer_path, 'rb') as fin:
        (ere_indexer, stmt_indexer, ere_emb_mat, stmt_emb_mat,
         num_word2vec_ere, num_word2vec_stmt) = dill.load(fin)
    indexer_info_dict = {
        'ere_indexer': ere_indexer,
        'stmt_indexer': stmt_indexer,
        'ere_emb_mat': ere_emb_mat,
        'stmt_emb_mat': stmt_emb_mat,
        'num_word2vec_ere': num_word2vec_ere,
        'num_word2vec_stmt': num_word2vec_stmt,
    }

    model = CoherenceNetWithGCN(
        False, indexer_info_dict, args.attention_type, None, args.num_layers,
        args.hidden_size, args.attention_size, args.conv_dropout, args.attention_dropout)

    # Load the pre-trained checkpoint on CPU, then move the model to the target device
    model_path = str(util.get_input_path(args.model_path))
    model.load_state_dict(
        torch.load(model_path, map_location=torch.device('cpu'))['model'])
    model.to(device)
    model.eval()

    working_dir = util.get_input_path(args.working_dir)
    seed_dir = util.get_input_path(working_dir / args.seed_dir)
    indexed_data_dir = util.get_input_path(working_dir / args.indexed_data_dir)
    output_dir = util.get_output_dir(working_dir / args.output_dir,
                                     overwrite_warning=not args.force)

    print('\nExpanding cluster seeds ...')
    evaluate(seed_dir, indexed_data_dir, output_dir, model, device)
    print(f'\nExpanding finished: raw hypotheses in directory {output_dir}')

def main():
    parser = ArgumentParser(
        description=
        'Read in a TA2 KB and a (list of) XML-based Statement of Information Need '
        'definition, convert the KB to JSON format, then convert each SoIN to a JSON '
        'query by identifying and ranking entry points.')
    parser.add_argument('kb_path', help='Path to the input TA2 KB')
    parser.add_argument('graph_output_path', help='Path to write the JSON graph')
    parser.add_argument(
        '-s', '--soin_path',
        help='Path to the input SoIN file, or a directory containing multiple SoIN '
             'files; if not provided, will only transform the graph')
    parser.add_argument(
        '-q', '--query_output_dir',
        help='Directory to write the JSON queries, used when soin_path is provided')
    parser.add_argument(
        '-m', '--max_matches', type=int, default=50,
        help='The maximum number of EPs *per entry point description*')
    parser.add_argument(
        '-d', '--dup_kb', default=duplicate_kb_file,
        help='Path to the json file with duplicate KB ID mappings')
    parser.add_argument(
        '-f', '--force', action='store_true', default=False,
        help='If specified, overwrite existing output files without warning')
    args = parser.parse_args()

    kb_path = util.get_input_path(args.kb_path)
    graph_output_path = util.get_output_path(args.graph_output_path,
                                             overwrite_warning=not args.force)

    aida_graph = AidaGraph()
    aida_graph.build_graph(str(kb_path), fmt='ttl')

    json_graph = JsonGraph()
    json_graph.build_graph(aida_graph)

    logging.info('Writing JSON graph to {} ...'.format(graph_output_path))
    with open(str(graph_output_path), 'w') as fout:
        json.dump(json_graph.as_dict(), fout, indent=1)
    logging.info('Done.')

    if args.soin_path is not None:
        assert args.query_output_dir is not None, 'Must provide query_output_dir'

        soin_path = util.get_input_path(args.soin_path)
        query_output_dir = util.get_output_dir(
            args.query_output_dir, overwrite_warning=not args.force)

        soin_file_paths = util.get_file_list(soin_path, suffix='.xml', sort=True)

        dup_kb_id_mapping = None
        if args.dup_kb is not None:
            dup_kb_id_mapping = util.read_json_file(args.dup_kb, 'duplicate KB ID mapping')

        logging.info('Getting Cluster Mappings ...')
        ere_to_prototypes = get_cluster_mappings(aida_graph)

        for soin_file_path in soin_file_paths:
            query_output_path = query_output_dir / (soin_file_path.stem + '_query.json')

            logging.info('Processing SOIN {} ...'.format(soin_file_path))
            soin = SOIN.parse(str(soin_file_path), dup_kbid_mapping=dup_kb_id_mapping)

            logging.info('Resolving all entrypoints ...')
            soin.resolve(aida_graph, ere_to_prototypes, max_matches=args.max_matches)

            query_json = {'graph': kb_path.stem}
            query_json.update(soin.to_json())

            logging.info('Writing JSON query to {} ...'.format(query_output_path))
            with open(str(query_output_path), 'w') as fout:
                json.dump(query_json, fout, indent=1)
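
# A hypothetical invocation of this script (the script, file, and directory names below are
# placeholders, not taken from the source; the flags mirror the argparse options above):
#
#   python preprocess_kb_to_json.py ta2_kb.ttl graph.json -s soin_dir/ -q query_dir/ -f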