def make_kg2(curies_to_categories: dict,
             uri_to_curie_shortener: callable,
             map_category_label_to_iri: callable,
             owl_urls_and_files: tuple,
             output_file_name: str,
             test_mode: bool = False):
    """Build the KG2 graph from a set of OWL ontology sources and save it as JSON.

    :param curies_to_categories: mapping used to assign semantic categories to nodes
    :param uri_to_curie_shortener: callable that shortens a URI to a CURIE
    :param map_category_label_to_iri: callable mapping a category label to an IRI
    :param owl_urls_and_files: per-ontology source descriptors; each dict carries at
        least 'url', 'file', 'title', and 'download' keys
    :param output_file_name: path of the JSON file to write
    :param test_mode: forwarded to kg2_util.save_json
    :raises FileNotFoundError: if a non-downloaded source file is missing locally
    """
    owl_file_information_dict_list = []

    # for each OWL file (or URL for an OWL file) described in the YAML config file...
    for ont_source_info_dict in owl_urls_and_files:
        if ont_source_info_dict['download']:
            # get the OWL file onto the local file system and get a full path to it
            local_file_name = kg2_util.download_file_if_not_exist_locally(
                ont_source_info_dict['url'],
                ont_source_info_dict['file'])
        else:
            local_file_name = ont_source_info_dict['file']
            # explicit check instead of `assert` (asserts are stripped under -O)
            if not os.path.exists(local_file_name):
                raise FileNotFoundError('ontology source file not found: ' + local_file_name)
        # load the OWL file data into an ontobio.ontol.Ontology data structure
        # and an information dictionary
        [ont, metadata_dict] = load_owl_file_return_ontology_and_metadata(
            local_file_name,
            ont_source_info_dict['url'],
            ont_source_info_dict['title'])
        metadata_dict['ontology'] = ont
        owl_file_information_dict_list.append(metadata_dict)

    kg2_util.log_message('Calling make_nodes_dict_from_ontologies_list')
    nodes_dict = make_nodes_dict_from_ontologies_list(owl_file_information_dict_list,
                                                      curies_to_categories,
                                                      uri_to_curie_shortener,
                                                      map_category_label_to_iri)

    kg2_util.log_message('Calling make_map_of_node_ontology_ids_to_curie_ids')
    map_of_node_ontology_ids_to_curie_ids = make_map_of_node_ontology_ids_to_curie_ids(
        nodes_dict)

    kg2_util.log_message('Calling get_rels_dict')
    # get a dictionary of all relationships including xrefs as relationships
    all_rels_dict = get_rels_dict(nodes_dict,
                                  owl_file_information_dict_list,
                                  uri_to_curie_shortener,
                                  map_of_node_ontology_ids_to_curie_ids)

    kg2_dict = dict()
    kg2_dict['edges'] = [rel_dict for rel_dict in all_rels_dict.values()]
    kg2_util.log_message('Number of edges: ' + str(len(kg2_dict['edges'])))
    kg2_dict['nodes'] = list(nodes_dict.values())
    kg2_util.log_message('Number of nodes: ' + str(len(kg2_dict['nodes'])))
    # free the (possibly large) intermediate index before serialization
    del nodes_dict

    # delete xrefs from all_nodes_dict
    for node_dict in kg2_dict['nodes']:
        del node_dict['xrefs']
        del node_dict['ontology node ids']

    kg2_util.log_message('Saving JSON file')
    kg2_util.save_json(kg2_dict, output_file_name, test_mode)
if __name__ == '__main__':
    args = make_arg_parser().parse_args()
    mysql_config_file = args.mysqlConfigFile
    mysql_db_name = args.mysqlDBName
    test_mode = args.test
    # credentials are read from the MySQL option file named on the command line
    connection = pymysql.connect(read_default_file=mysql_config_file,
                                 db=mysql_db_name)
    # removed dead local `preds_dict = dict()` -- it was never used
    sql_statement = (
        "SELECT PMID, SUBJECT_CUI, PREDICATE, OBJECT_CUI, DP, SENTENCE, SUBJECT_SCORE, "
        "OBJECT_SCORE, DATE_FORMAT(CURR_TIMESTAMP, '%Y-%m-%d %H:%i:%S') FROM ((PREDICATION NATURAL JOIN CITATIONS) "
        "NATURAL JOIN SENTENCE) NATURAL JOIN PREDICATION_AUX")
    if test_mode:
        # bound the result set so test runs stay fast
        sql_statement += " LIMIT 10000"
    # 'data_dictionary' names the columns of each fetched row, in SELECT order
    # NOTE(review): the SQL column DP lines up with 'pub_date' here -- confirm
    # against the SemMedDB schema that this pairing is intended
    results = {
        'data_dictionary': [
            'pmid', 'subject_cui_str', 'predicate', 'object_cui_str',
            'pub_date', 'sentence', 'subject_score', 'object_score',
            'curr_timestamp'
        ]
    }
    with connection.cursor() as cursor:
        cursor.execute(sql_statement)
        results['rows'] = cursor.fetchall()
    connection.close()
    output_file_name = args.outputFile
    kg2_util.save_json(results, output_file_name, test_mode)
# NOTE(review): this chunk starts inside the per-metabolite loop; the loop
# header (presumably `for metabolite in ...`) is not visible here -- confirm
# indentation against the full file.
metabolite_count += 1
# hard cap: only the first 10000 metabolites are converted
if metabolite_count <= 10000:
    hmdb_id = metabolite["accession"]
    nodes.append(make_node(metabolite, hmdb_id))
    # each helper yields zero or more edges for this metabolite
    for edge in make_disease_edges(metabolite, hmdb_id):
        edges.append(edge)
    for edge in make_protein_edges(metabolite, hmdb_id):
        edges.append(edge)
    for edge in make_equivalencies(metabolite, hmdb_id):
        edges.append(edge)
    for edge in make_property_edges(metabolite, hmdb_id):
        edges.append(edge)
else:
    break
# --- statements below run after the metabolite loop ---
file_update_date = convert_date(os.path.getmtime(args.inputFile))
# provenance node describing the HMDB source itself
hmdb_kp_node = kg2_util.make_node(HMDB_PROVIDED_BY_CURIE_ID,
                                  HMDB_KB_IRI,
                                  "Human Metabolome Database",
                                  kg2_util.BIOLINK_CATEGORY_DATA_FILE,
                                  file_update_date,
                                  HMDB_PROVIDED_BY_CURIE_ID)
nodes.append(hmdb_kp_node)
print("Saving JSON at", date())
kg2_util.save_json({"nodes": nodes, "edges": edges}, args.outputFile, args.test)
print("Finished saving JSON at", date())
print("Script finished at", date())
# NOTE(review): the first statements are the tail of get_edges(); its `def`
# line is not visible in this chunk.
for edge in get_physical_entity_characteristics(connection, test):
    edges.append(edge)
for edge in get_members_of_set(connection, test):
    edges.append(edge)
for edge in get_species(connection, test):
    edges.append(edge)
return edges


if __name__ == '__main__':
    args = get_args()
    # credentials are read from the MySQL option file named on the command line
    connection = pymysql.connect(read_default_file=args.mysqlConfigFile,
                                 db=args.mysqlDBName)
    # raise session limits; presumably needed by GROUP_CONCAT-heavy queries
    # in the helpers -- confirm against get_nodes/get_edges SQL
    run_sql("SET SESSION group_concat_max_len=35000", connection)
    run_sql("SET SESSION sort_buffer_size=256000000", connection)
    nodes = get_nodes(connection, args.test)
    edges = get_edges(connection, args.test)
    # provenance node describing the Reactome source itself
    kp_node = kg2_util.make_node(REACTOME_KB_CURIE_ID, REACTOME_KB_IRI,
                                 'Reactome', kg2_util.BIOLINK_CATEGORY_DATA_FILE,
                                 None, REACTOME_KB_CURIE_ID)
    nodes.append(kp_node)
    graph = {'nodes': nodes, 'edges': edges}
    kg2_util.save_json(graph, args.outputFile, args.test)
# NOTE(review): this chunk starts mid-expression -- these are the trailing
# arguments of a node-construction call whose opening line is not visible.
update_date, other_synonyms)
nodes.append(node_dict)
# remember this gene's CURIE for the edges built below
ensembl_gene_curie_id = node_dict['id']
taxon_id_int = gene_dict.get('taxon_id', None)
# only human (NCBITaxon 9606) records are expected here; `assert` will be
# stripped under -O, so this is a sanity check rather than validation
assert taxon_id_int == 9606, "unexpected taxon ID"
edges.append(kg2_util.make_edge(ensembl_gene_curie_id,
                                'NCBITaxon:' + str(taxon_id_int),
                                'gene_found_in_organism',
                                ENSEMBL_KB_IRI,
                                update_date))
# cross-reference edges to HGNC, when the record lists any
hgnc_list = gene_dict.get('HGNC', None)
if hgnc_list is not None:
    for hgnc_curie in hgnc_list:
        edges.append(kg2_util.make_edge(ensembl_gene_curie_id, hgnc_curie,
                                        'xref', ENSEMBL_KB_IRI, update_date))
return {'nodes': nodes, 'edges': edges}


if __name__ == '__main__':
    args = get_args()
    # positionals declared with nargs=1 arrive as one-element lists
    input_file_name = args.inputFile[0]
    output_file_name = args.outputFile[0]
    test_mode = args.test
    graph = make_kg2_graph(input_file_name, test_mode)
    kg2_util.save_json(graph, output_file_name, test_mode)
import argparse

import kg2_util


def make_arg_parser():
    """Build the command-line parser for sample_subgraph.py."""
    arg_parser = argparse.ArgumentParser(
        description='sample_subgraph.py: sample a smaller subgraph of a KG in JSON format')
    arg_parser.add_argument('--test', dest='test', action="store_true", default=False)
    # nargs=1 makes argparse store each positional as a one-element list
    arg_parser.add_argument('inputFile', type=str, nargs=1)
    arg_parser.add_argument('outputFile', type=str, nargs=1)
    return arg_parser


if __name__ == '__main__':
    args = make_arg_parser().parse_args()
    # BUGFIX: nargs=1 positionals are one-element lists; previously the lists
    # themselves were passed to load_json/save_json, which expect file names
    input_file_name = args.inputFile[0]
    output_file_name = args.outputFile[0]
    graph = kg2_util.load_json(input_file_name)
    # keep every 5th node, then only the edges whose endpoints both survive
    nodes = [graph['nodes'][i] for i in range(0, len(graph['nodes']), 5)]
    nodes_id_set = set([node['id'] for node in nodes])
    edges = [edge for edge in graph['edges']
             if edge['subject'] in nodes_id_set and edge['object'] in nodes_id_set]
    # NOTE(review): args.test is parsed but not forwarded to save_json here --
    # confirm whether that is intended
    kg2_util.save_json({'nodes': nodes, 'edges': edges}, output_file_name)
# Merge new edge files into an existing KG, keeping only edges whose endpoints
# already exist in the KG; the rest are collected as "orphan" edges.
kg_edges_file_names = args.kgFileNewEdges
test_mode = args.test
output_file_name = args.outputFile[0]
# use a context manager so the input file handle is closed deterministically
with open(kg_file_name, 'r') as kg_file:
    kg = json.load(kg_file)
kg_orphan_edges = {'edges': []}
# Hoisted out of the loop: kg['nodes'] is never modified below, so the
# id -> node index only needs to be built once (it was rebuilt per edges file).
nodes_dict = {node['id']: node for node in kg['nodes']}
for kg_edges_file_name in kg_edges_file_names:
    kg_orphan_edges_new = []
    ctr_edges_added = 0
    with open(kg_edges_file_name, 'r') as edges_file:
        kg_edges_new = json.load(edges_file)
    for rel_dict in kg_edges_new['edges']:
        subject_curie = rel_dict['subject']
        object_curie = rel_dict['object']
        # an edge is kept only if both endpoints are known KG nodes
        if subject_curie in nodes_dict and object_curie in nodes_dict:
            ctr_edges_added += 1
            kg['edges'].append(rel_dict)
        else:
            kg_orphan_edges_new.append(rel_dict)
    kg_orphan_edges['edges'] += kg_orphan_edges_new
    kg2_util.log_message("number edges added: " + str(ctr_edges_added),
                         ontology_name=kg_edges_file_name,
                         output_stream=sys.stderr)
    kg2_util.log_message("number of orphan edges: " + str(len(kg_orphan_edges['edges'])),
                         ontology_name=kg_edges_file_name,
                         output_stream=sys.stderr)
kg2_util.save_json(kg, output_file_name, test_mode)
# orphan edges are only written out if a destination file was supplied
kg_file_orphan_edges = args.kgFileOrphanEdges
if kg_file_orphan_edges is not None:
    kg2_util.save_json(kg_orphan_edges, kg_file_orphan_edges, test_mode)
if __name__ == '__main__':
    args = get_args()
    test_mode = args.test
    # removed dead `edges = []` / `nodes = []` initializers -- both names were
    # unconditionally reassigned below before first use
    with open(args.inputFile, 'r') as input_file:
        json_data = json.load(input_file)
    # the version table's first row carries the release timestamp
    update_date = json_data['version'][0]['dtime']
    # build the edge list from each DrugCentral table in turn
    edges = process_external_ids(json_data['external_ids'], update_date, test_mode)
    edges += process_omop_relations(json_data['omop_relations'], update_date, test_mode)
    edges += process_faers_data(json_data['faers_data'], update_date, test_mode)
    edges += process_atc_codes(json_data['atc_ids'], update_date, test_mode)
    edges += process_bioactivities(json_data['bioactivities'], update_date, test_mode)
    edges += process_pharmacologic_actions(json_data['pharmacologic_action'],
                                           update_date, test_mode)
    nodes = make_nodes(json_data['drugcentral_ids'], update_date)
    # provenance node describing the DrugCentral source itself
    kp_node = kg2_util.make_node(DRUGCENTRAL_SOURCE, BASE_URL_DRUGCENTRAL,
                                 'DrugCentral', kg2_util.BIOLINK_CATEGORY_DATA_FILE,
                                 update_date, DRUGCENTRAL_SOURCE)
    nodes.append(kp_node)
    graph = {'edges': edges, 'nodes': nodes}
    kg2_util.save_json(graph, args.outputFile, test_mode)
# NOTE(review): `return arg_parser` is the tail of a parser-builder function
# whose `def` line is not visible in this chunk.
return arg_parser


if __name__ == "__main__":
    # whitelists of the node/edge keys to keep in the reduced graph
    node_set = set(["name", "id", "full name", "category label"])
    edge_set = set([
        "simplified relation curie", "subject", "object",
        "simplified edge label", "provided by"
    ])
    args = make_arg_parser().parse_args()
    test_mode = args.test
    reduced = {"nodes": [], "edges": []}
    with open(args.inputFilepath, "r") as fp:
        all_data = json.load(fp)
    # copy each node, keeping only the whitelisted keys
    for node in all_data["nodes"]:
        temp_node = {}
        for key, val in node.items():
            if key in node_set:
                temp_node[key] = val
        reduced["nodes"].append(temp_node)
    # same key projection for edges
    for edge in all_data["edges"]:
        temp_edge = {}
        for key, val in edge.items():
            if key in edge_set:
                temp_edge[key] = val
        reduced["edges"].append(temp_edge)
    kg2_util.save_json(reduced, args.outputFilepath, test_mode)
# NOTE(review): chunk starts mid-statement -- these names complete a
# multi-target list unpacking of `line` begun on a line not visible here.
evidence_score, created_date, update_date, pmid, source] = line
# rows whose source is 'BEFREE' are skipped (presumably because they are
# text-mined associations -- confirm); everything else becomes an edge
if source != 'BEFREE':
    non_befree_count += 1
    subject_id = format_id(subject_id, kg2_util.CURIE_PREFIX_NCBI_GENE)
    object_id = format_id(object_id, kg2_util.CURIE_PREFIX_UMLS)
    predicate = kg2_util.EDGE_LABEL_BIOLINK_GENE_ASSOCIATED_WITH_CONDITION
    edge = kg2_util.make_edge_biolink(subject_id, object_id, predicate,
                                      DISGENET_KB_CURIE, update_date)
    # attach the supporting PubMed publication to the edge
    publication = kg2_util.CURIE_PREFIX_PMID + ':' + pmid
    edge['publications'] = [publication]
    edges.append(edge)
return edges


if __name__ == '__main__':
    args = get_args()
    input_file = args.inputFile
    output_file = args.outputFile
    edges = make_edges(input_file, args.test)
    nodes = []
    # provenance node describing the DisGeNET source itself
    kp_node = kg2_util.make_node(DISGENET_KB_CURIE, DISGENET_BASE_IRI,
                                 "DisGeNET", kg2_util.BIOLINK_CATEGORY_DATA_FILE,
                                 None, DISGENET_KB_CURIE)
    nodes.append(kp_node)
    graph = {"edges": edges, "nodes": nodes}
    kg2_util.save_json(graph, output_file, args.test)
def make_arg_parser():
    """Build the command-line parser for sample_subgraph.py."""
    arg_parser = argparse.ArgumentParser(
        description='sample_subgraph.py: sample a smaller subgraph of a KG in JSON format')
    arg_parser.add_argument('--test', dest='test', action="store_true", default=False)
    # nargs=1 makes argparse store each positional as a one-element list
    arg_parser.add_argument('inputFile', type=str, nargs=1)
    arg_parser.add_argument('outputFile', type=str, nargs=1)
    return arg_parser


if __name__ == '__main__':
    args = make_arg_parser().parse_args()
    # BUGFIX: nargs=1 positionals are one-element lists; previously the lists
    # themselves were passed to load_json/save_json, which expect file names
    input_file_name = args.inputFile[0]
    output_file_name = args.outputFile[0]
    graph = kg2_util.load_json(input_file_name)
    # keep every 5th node, then only the edges whose endpoints both survive
    nodes = [graph['nodes'][i] for i in range(0, len(graph['nodes']), 5)]
    nodes_id_set = set([node['id'] for node in nodes])
    edges = [edge for edge in graph['edges']
             if edge['subject'] in nodes_id_set and edge['object'] in nodes_id_set]
    # carry the 'build' metadata through to the sampled graph, if present
    build = graph.get('build', None)
    out_graph = {'nodes': nodes, 'edges': edges}
    if build is not None:
        out_graph['build'] = build
    kg2_util.save_json(out_graph, output_file_name)
# NOTE(review): chunk starts mid-expression -- these URLs are the tail of a
# conversion-query list literal whose opening is not visible here.
"http://rest.kegg.jp/conv/glycan/chebi",
"http://rest.kegg.jp/conv/drug/chebi"]
get_base_query = "http://rest.kegg.jp/get/"
# list queries return one tab-separated "<kegg id>\t<name>" row per line
for query in list_queries:
    for results in send_query(query).split('\n'):
        if len(results) < 1:
            continue  # skip blank lines in the response
        results = results.split('\t')
        results_dict[results[0]] = {'name': results[1]}
# conv queries return "<external id>\t<kegg id>" rows linking equivalent ids
for query in conv_queries:
    for results in send_query(query).split('\n'):
        if len(results) < 1:
            continue
        results = results.split('\t')
        # NOTE(review): assumes every converted id was already seen via the
        # list queries; a missing key raises KeyError here -- confirm
        results_dict[results[1]]['eq_id'] = results[0]
kegg_ids = len(results_dict.keys())
get_count = 0
# fetch the full KEGG record for every collected id
for kegg_id in results_dict:
    previous_line_starter = ''  # NOTE(review): unused in the visible code
    results = send_query(get_base_query + kegg_id)
    # NOTE(review): rebinding results_dict while iterating it is only safe if
    # process_get_query returns the same dict object without adding or
    # removing keys -- confirm against process_get_query
    results_dict = process_get_query(results, results_dict, kegg_id)
    get_count += 1
    if get_count % 1000 == 0:
        # progress report roughly every 1000 records
        print("Processed", get_count, "out of", kegg_ids, "at", date())
return results_dict


if __name__ == '__main__':
    args = get_args()
    # NOTE(review): the third argument (test mode flag) is hard-coded to True
    # regardless of command-line options -- confirm this is intended
    kg2_util.save_json(run_queries(), args.outputFile, True)