def make_nodes(drugcentral_ids, update_date):
    """Build KG2 drug nodes from DrugCentral identifier rows.

    Each input row carries a DrugCentral id, a name, and a preferred_name
    flag ("1" marks the preferred name). Rows are grouped by reformatted
    DrugCentral CURIE; the preferred name becomes the node name and all
    other names become synonyms.
    """
    # Pass 1: group the name rows by reformatted DrugCentral CURIE,
    # separating the preferred name from the synonyms.
    grouped = dict()
    for row in drugcentral_ids:
        raw_id = row['id']
        row_name = row['name']
        if len(raw_id) < 1:
            # skip rows with an empty identifier
            continue
        curie = format_drugcentral_id(raw_id)
        entry = grouped.setdefault(curie, {'synonyms': []})
        if row['preferred_name'] == "1":
            entry['name'] = row_name
        else:
            entry['synonyms'].append(row_name)
    # Pass 2: emit one KG2 node per grouped DrugCentral CURIE.
    nodes = []
    for curie, entry in grouped.items():
        # IRI is the base URL plus the local part of the CURIE
        iri = BASE_URL_DRUGCENTRAL + curie.split(':')[1]
        node = kg2_util.make_node(curie, iri, entry['name'],
                                  kg2_util.BIOLINK_CATEGORY_DRUG,
                                  update_date, DRUGCENTRAL_SOURCE)
        node['synonym'] = entry['synonyms']
        nodes.append(node)
    return nodes
def make_kg2_graph(drugbank_dict: dict, test_mode: bool):
    """Build the KG2 nodes/edges graph from a parsed DrugBank XML dict.

    Adds a knowledge-provider node for DrugBank itself, then one node (and
    its edges) per drug record. In test mode, processing stops after
    10000 drugs.
    """
    drugs = drugbank_dict["drugbank"]["drug"]
    update_date = drugbank_dict["drugbank"]["@exported-on"]
    # Knowledge-provider node describing the DrugBank source itself
    kp_node = kg2_util.make_node(DRUGBANK_KB_CURIE_ID,
                                 DRUGBANK_KB_IRI,
                                 "DrugBank",
                                 kg2_util.BIOLINK_CATEGORY_DATA_FILE,
                                 update_date,
                                 DRUGBANK_KB_CURIE_ID)
    nodes = [kp_node]
    edges = []
    for drug_ctr, drug in enumerate(drugs, start=1):
        if test_mode and drug_ctr > 10000:
            break
        node = make_node(drug)
        if node is not None:
            nodes.append(node)
        # make_edges may yield None entries; keep only real edges
        edges.extend(edge for edge in make_edges(drug) if edge is not None)
    return {"nodes": nodes, "edges": edges}
def make_node(metabolite: dict, hmdb_id: str):
    """Convert one parsed HMDB metabolite record into a KG2 node dict.

    Reads name, dates, description, synonyms, and general references from
    the metabolite dict; publications are the keys of the dict returned by
    pull_out_references.
    """
    name = metabolite["name"]
    update_date = metabolite["update_date"]
    creation_date = metabolite["creation_date"]
    description = metabolite["description"]
    # The synonyms field (from xmltodict) may hold a single value or a
    # list under the "synonym" key; normalize to a flat list.
    synonyms = []
    synonym_field = metabolite["synonyms"]
    if isinstance(synonym_field, dict) and "synonym" in synonym_field:
        stored = synonym_field["synonym"]
        if isinstance(stored, list):
            synonyms.extend(stored)
        else:
            synonyms.append(stored)
    general_references = pull_out_references(metabolite["general_references"])
    publications = list(general_references.keys())
    node = kg2_util.make_node(CURIE_PREFIX_HMDB + ":" + hmdb_id,
                              HMDB_BASE_IRI + hmdb_id,
                              name,
                              kg2_util.BIOLINK_CATEGORY_METABOLITE,
                              update_date,
                              HMDB_PROVIDED_BY_CURIE_ID)
    node["description"] = description
    node["synonym"] = synonyms
    node["creation_date"] = creation_date
    node["publications"] = publications
    return node
def make_node(id: str, iri: str, name: str, category_label: str,
              description: str, synonym: list, publications: list,
              update_date: str):
    """Assemble a ChEMBL KG2 node dict with description/synonym/publications.

    Note: the parameter name `id` shadows the `id` builtin; it is kept
    unchanged for interface compatibility with existing callers.
    """
    node_dict = kg2_util.make_node(id, iri, name, category_label,
                                   update_date, CHEMBL_KB_IRI)
    node_dict['description'] = description
    # Bug fix: the body previously assigned the undefined name `synonyms`
    # (the parameter is `synonym`), which raised NameError at runtime.
    node_dict['synonym'] = synonym
    node_dict['publications'] = publications
    return node_dict
def make_node_and_edges(article: dict, mesh_predicate_label: str,
                        mesh_relation_curie: str):
    """Create a publication node (and MeSH-topic edges) for one PubMed article.

    Only articles whose PMID appears in the module-level `pmids` set are
    converted. Returns a two-element list: a {"nodes": ..., "edges": ...}
    dict and the article's update date (always computed, even for skipped
    articles).
    """
    nodes = []
    edges = []
    article_citation = article["MedlineCitation"]
    pmid = kg2_util.CURIE_PREFIX_PMID + ":" + article_citation["PMID"]["#text"]
    update_date = extract_date(article_citation["DateRevised"])
    if pmid in pmids:
        # These aren't necessary yet, but it might be someday, so I wrote
        # and tested a couple of functions to extract them
        #authors = get_authors(article_citation)
        #journal = get_journal(article_citation)
        name = article_citation["Article"]["ArticleTitle"]
        if isinstance(name, dict):
            # Bug fix: was a bare `except:` (also caught SystemExit /
            # KeyboardInterrupt); the dict lookup can only raise KeyError.
            try:
                name = name["#text"]
            except KeyError:
                # Fallback: dig one level deeper for the "#text" payload
                temp_name = name
                for key in temp_name:
                    name = temp_name[key]["#text"]
        try:
            created_date = extract_date(
                article_citation["Article"]["ArticleDate"])
        except (KeyError, IndexError, TypeError):
            # ArticleDate is optional / may be malformed; treat as unknown
            created_date = None
        iri = PMID_BASE_IRI + article_citation["PMID"]["#text"]
        node = kg2_util.make_node(pmid, iri, name,
                                  BIOLINK_CATEGORY_PUBLICATION,
                                  update_date, PMID_PROVIDED_BY_CURIE_ID)
        node["creation_date"] = created_date
        nodes.append(node)
        try:
            # MeshHeading may be absent (KeyError) or, via xmltodict, a
            # single dict instead of a list (iterating then yields string
            # keys, whose subscripting raises TypeError).
            for mesh_topic in (
                    article_citation["MeshHeadingList"]["MeshHeading"]):
                mesh_id = kg2_util.CURIE_PREFIX_MESH + ":" + \
                    mesh_topic["DescriptorName"]["@UI"]
                edge = kg2_util.make_edge(pmid, mesh_id,
                                          mesh_relation_curie,
                                          mesh_predicate_label,
                                          PMID_PROVIDED_BY_CURIE_ID,
                                          update_date)
                edges.append(edge)
        except (KeyError, TypeError):
            mesh_id = None
    return [{"nodes": nodes, "edges": edges}, update_date]
def format_node(drugbank_id: str, description: str, name: str,
                update_date: str, synonyms: list, publications: list,
                category_label: str, creation_date: str):
    """Assemble a KG2 node dict for one DrugBank entry.

    The node CURIE and IRI are both derived from the DrugBank id; the
    remaining arguments are copied onto the node verbatim.
    """
    curie = kg2_util.CURIE_PREFIX_DRUGBANK + ":" + drugbank_id
    node = kg2_util.make_node(curie,
                              DRUGBANK_BASE_IRI + drugbank_id,
                              name,
                              category_label,
                              update_date,
                              DRUGBANK_KB_CURIE_ID)
    node["synonym"] = synonyms
    node["creation_date"] = creation_date
    node["description"] = description
    node["publications"] = publications
    return node
def make_kg2_graph(input_file_name: str, test_mode: bool = False):
    """Build the KG2 graph from an Ensembl genes JSON file.

    Emits a knowledge-provider node for Ensembl, one node per gene, an
    in_taxon edge to human (NCBITaxon:9606) per gene, and same_as edges to
    any HGNC CURIEs. In test mode, stops after 10000 genes.

    Raises:
        ValueError: if a gene record has a taxon ID other than 9606.
    """
    ensembl_data = kg2_util.load_json(input_file_name)
    nodes = []
    edges = []
    genebuild_str = ensembl_data['genebuild']
    # genebuild looks like "<something>/<date>"; the date is the 2nd field
    update_date = genebuild_str.split('/')[1]
    gene_ctr = 0
    ontology_curie_id = ENSEMBL_KB_CURIE_ID
    ens_kp_node = kg2_util.make_node(ontology_curie_id, ENSEMBL_KB_URI,
                                     'Ensembl Genes',
                                     kg2_util.BIOLINK_CATEGORY_DATA_FILE,
                                     update_date, ontology_curie_id)
    nodes.append(ens_kp_node)
    for gene_dict in ensembl_data['genes']:
        gene_ctr += 1
        if test_mode and gene_ctr > 10000:
            break
        ensembl_gene_id = gene_dict['id']
        description = gene_dict.get('description', None)
        gene_symbol = gene_dict.get('name', None)
        other_synonyms = []
        xrefs = gene_dict.get('xrefs', None)
        if xrefs is not None:
            # cross-reference primary IDs (minus the gene's own ID) are
            # used as extra synonyms; de-duplicated via set
            other_synonyms = list(
                set([xref['primary_id'] for xref in xrefs
                     if xref['primary_id'] != ensembl_gene_id]))
        node_dict = make_node(ensembl_gene_id, description, gene_symbol,
                              update_date, other_synonyms)
        nodes.append(node_dict)
        ensembl_gene_curie_id = node_dict['id']
        taxon_id_int = gene_dict.get('taxon_id', None)
        # Bug fix: this validation used `assert`, which is silently
        # stripped when Python runs with -O; raise explicitly instead.
        if taxon_id_int != 9606:
            raise ValueError("unexpected taxon ID: " + str(taxon_id_int))
        edges.append(
            kg2_util.make_edge_biolink(
                ensembl_gene_curie_id,
                kg2_util.CURIE_PREFIX_NCBI_TAXON + ':' + str(taxon_id_int),
                kg2_util.EDGE_LABEL_BIOLINK_IN_TAXON,
                ENSEMBL_KB_CURIE_ID, update_date))
        hgnc_list = gene_dict.get('HGNC', None)
        if hgnc_list is not None:
            for hgnc_curie in hgnc_list:
                edges.append(
                    kg2_util.make_edge(ensembl_gene_curie_id, hgnc_curie,
                                       kg2_util.CURIE_ID_OWL_SAME_AS,
                                       kg2_util.EDGE_LABEL_OWL_SAME_AS,
                                       ENSEMBL_KB_CURIE_ID, update_date))
    return {'nodes': nodes, 'edges': edges}
def make_node(ncbi_gene_id: str, full_name: str, gene_symbol: str,
              update_date: str, other_synonyms: list = None):
    """Make a KG2 gene node for an NCBI gene ID.

    The synonym list starts with the gene symbol, followed by the
    de-duplicated remaining synonyms in sorted order.
    """
    extra = [] if other_synonyms is None else other_synonyms
    curie = kg2_util.CURIE_PREFIX_NCBI_GENE + ':' + ncbi_gene_id
    node = kg2_util.make_node(curie,
                              NCBI_BASE_IRI + ncbi_gene_id,
                              full_name,
                              kg2_util.BIOLINK_CATEGORY_GENE,
                              update_date,
                              NCBI_KB_CURIE_ID)
    node['synonym'] = [gene_symbol] + sorted(set(extra))
    return node
def make_node(ensembl_gene_id: str, description: str, gene_symbol: str,
              update_date: str, other_synonyms: list = None):
    """Make a KG2 gene node for an Ensembl gene ID.

    The node name is the gene description; the gene symbol leads the
    synonym list, followed by the remaining synonyms.
    """
    # NOTE(review): sibling implementations use
    # kg2_util.BIOLINK_CATEGORY_GENE and a CURIE (not an IRI) as
    # provided_by — confirm whether 'gene' / ENSEMBL_KB_IRI here are
    # intentional before changing them.
    category_label = 'gene'
    if other_synonyms is None:
        other_synonyms = []
    node_curie = kg2_util.CURIE_PREFIX_ENSEMBL + ':' + ensembl_gene_id
    iri = ENSEMBL_BASE_IRI + ensembl_gene_id
    node_dict = kg2_util.make_node(node_curie, iri, description,
                                   category_label, update_date,
                                   ENSEMBL_KB_IRI)
    # Fix: list(set(...)) yielded a nondeterministic synonym order (set
    # iteration order varies between runs); sort for reproducible output,
    # matching the sibling make_node implementations.
    node_dict['synonym'] = [gene_symbol] + sorted(set(other_synonyms))
    return node_dict
def make_node(ensembl_gene_id: str, description: str, gene_symbol: str,
              update_date: str, other_synonyms: list = None):
    """Make a KG2 gene node for an Ensembl gene ID.

    The node is created with the description as its name, which is then
    overwritten with the gene symbol; the symbol also leads the sorted,
    de-duplicated synonym list.
    """
    extra = [] if other_synonyms is None else other_synonyms
    curie = kg2_util.CURIE_PREFIX_ENSEMBL + ':' + ensembl_gene_id
    node = kg2_util.make_node(curie,
                              ENSEMBL_BASE_IRI + ensembl_gene_id,
                              description,
                              kg2_util.BIOLINK_CATEGORY_GENE,
                              update_date,
                              ENSEMBL_KB_CURIE_ID)
    # display name is the gene symbol, not the description
    node['name'] = gene_symbol
    node['synonym'] = [gene_symbol] + sorted(set(extra))
    return node
def make_node(ncbi_gene_id: str, full_name: str, gene_symbol: str,
              update_date: str, category_label: str,
              other_synonyms: list = None) -> dict:
    """Make a KG2 node for an NCBI gene, labeled as a genetic locus.

    The node is created with full_name as its name, then renamed to
    "Genetic locus associated with <symbol>"; the symbol leads the sorted,
    de-duplicated synonym list.
    """
    extra = [] if other_synonyms is None else other_synonyms
    curie = kg2_util.CURIE_PREFIX_NCBI_GENE + ':' + ncbi_gene_id
    node = kg2_util.make_node(curie,
                              NCBI_BASE_IRI + ncbi_gene_id,
                              full_name,
                              category_label,
                              update_date,
                              NCBI_KB_CURIE_ID)
    node['synonym'] = [gene_symbol] + sorted(set(extra))
    node['name'] = "Genetic locus associated with " + gene_symbol
    return node
def make_node(ncbi_gene_id: str, full_name: str, gene_symbol: str,
              update_date: str, other_synonyms: list = None):
    """Make a KG2 gene node for an NCBI gene ID.

    The gene symbol leads the synonym list, followed by the remaining
    de-duplicated synonyms in sorted order.
    """
    category_label = 'gene'
    if other_synonyms is None:
        other_synonyms = []
    node_curie = kg2_util.CURIE_PREFIX_NCBI_GENE + ':' + ncbi_gene_id
    # NOTE(review): sibling implementations build the IRI without the '/'
    # and pass NCBI_KB_CURIE_ID (a CURIE) as provided_by — confirm whether
    # the '/' and NCBI_BASE_IRI here are intentional before changing them.
    iri = NCBI_BASE_IRI + '/' + ncbi_gene_id
    node_dict = kg2_util.make_node(node_curie, iri, full_name,
                                   category_label, update_date,
                                   NCBI_BASE_IRI)
    # Fix: list(set([gene_symbol] + other_synonyms)) produced a
    # nondeterministic synonym order (and an arbitrary position for the
    # gene symbol). Emit the symbol first, then the rest sorted, matching
    # the sibling make_node implementations; the element set is unchanged.
    node_dict['synonym'] = [gene_symbol] + \
        sorted(set(other_synonyms) - {gene_symbol})
    return node_dict
def make_kg2_graph(kegg, update_date):
    """Build the KG2 nodes/edges graph from a parsed KEGG dict.

    Only entries whose ID starts with the KEGG compound prefix are
    processed; a knowledge-provider node for KEGG is appended last.
    """
    nodes = []
    edges = []
    for kegg_id, kegg_dict in kegg.items():
        if not kegg_id.startswith(KEGG_COMPOUND_PREFIX):
            continue
        node, compound_edges = process_compound(kegg_dict, kegg_id,
                                                update_date)
        nodes.append(node)
        edges.extend(compound_edges)
    # Knowledge-provider node describing the KEGG source itself
    kp_node = kg2_util.make_node(KEGG_PROVIDED_BY,
                                 KEGG_SOURCE_IRI,
                                 'Kyoto Encyclopedia of Genes and Genomes',
                                 kg2_util.BIOLINK_CATEGORY_DATA_FILE,
                                 update_date,
                                 KEGG_PROVIDED_BY)
    nodes.append(kp_node)
    return {'nodes': nodes, 'edges': edges}
def make_node(id: str, iri: str, name: str, category_label: str,
              description: str, synonym: list, publications: list,
              update_date: str, canonical_smiles: str = None):
    """Assemble a ChEMBL KG2 node dict.

    Synonyms and publications are stored sorted; the canonical SMILES
    string (if any) is recorded as the biological sequence.

    Note: the parameter name `id` shadows the `id` builtin; it is kept
    unchanged for interface compatibility with existing callers.
    """
    node = kg2_util.make_node(id, iri, name, category_label,
                              update_date, CHEMBL_KB_CURIE_ID)
    node['description'] = description
    node['synonym'] = sorted(synonym)
    node['publications'] = sorted(publications)
    node['has_biological_sequence'] = canonical_smiles
    return node
def format_node(node_id, name, category_label, update_date,
                description=None, sequence=None, synonym=None):
    """Build a KG2 node for a KEGG identifier.

    The node CURIE comes from format_id(node_id) and the IRI from the KEGG
    base IRI; description, sequence (if non-empty), and synonyms are
    attached to the node.
    """
    # Fix: the signature previously used a mutable default (`synonym=[]`),
    # which is a single list object shared across all calls; use None and
    # create a fresh list per call instead.
    if synonym is None:
        synonym = []
    iri = KEGG_BASE_IRI + node_id
    curie_id = format_id(node_id)
    node = kg2_util.make_node(curie_id, iri, name, category_label,
                              update_date, KEGG_PROVIDED_BY)
    node['description'] = description
    if sequence is not None and len(sequence) > 0:
        node['has_biological_sequence'] = sequence
    node['synonym'] = synonym
    return node
def make_nodes(entries, test_mode):
    """Build KG2 microRNA nodes from parsed miRBase entries.

    Returns a three-element list: [nodes, all_xrefs (node_id -> formatted
    xref list), nodes_to_species (node_id -> species id)]. Entries for
    species not accepted by only_include_certain_species are skipped; in
    test mode, processing stops after 1000 accepted entries.
    """
    nodes = []
    all_xrefs = dict()
    nodes_to_species = dict()
    entry_count = 0
    for entry in entries:
        # species name is the third ';'-separated field of the ID line
        species = entry['ID'].split(';')[2].strip()
        species_id = only_include_certain_species(species)
        if not species_id:
            continue
        entry_count += 1
        if test_mode and entry_count > 1000:
            break
        node_id = get_node_id(entry)
        node_iri = kg2_util.BASE_URL_MIRBASE + node_id.split(':')[1]
        node_category = kg2_util.BIOLINK_CATEGORY_MICRORNA
        node_name = entry['DE'].strip()
        # NOTE(review): the second replace below is a no-op as written
        # (single space -> single space); it may have originally collapsed
        # double spaces — confirm against the upstream source.
        description = entry.get('CC', '').replace('\t', ' ').replace(' ', ' ')
        sequence = entry.get('SQ', '').replace('\t', ' ')
        publications = entry.get('RX', None)
        xrefs = entry.get('DR', None)
        if xrefs is not None:
            # Perf fix: call format_xref once per xref (it was previously
            # invoked twice per xref — once for the filter, once for the
            # value).
            formatted_xrefs = (format_xref(xref)
                               for xref in xrefs.split('\t'))
            xrefs = [xref for xref in formatted_xrefs if xref is not None]
            all_xrefs[node_id] = xrefs
        if publications is not None:
            publications = [format_publication(publication)
                            for publication in publications.split('\t')]
        node = kg2_util.make_node(node_id, node_iri, node_name,
                                  node_category, None,
                                  MIRBASE_KB_CURIE_ID)
        node['description'] = description
        node['publications'] = publications
        # Bug fix: sequence.strip('Sequence ') strips any of the CHARACTERS
        # "Sequnc " from BOTH ends of the string — which also eats trailing
        # sequence letters such as 'u', 'c', and 'e'. Remove the literal
        # "Sequence " prefix instead, leaving the sequence tail intact.
        seq = sequence.strip()
        if seq.startswith('Sequence '):
            seq = seq[len('Sequence '):]
        node['has_biological_sequence'] = seq
        nodes.append(node)
        nodes_to_species[node_id] = species_id
    return [nodes, all_xrefs, nodes_to_species]
def make_kg2_graph(input_file_name: str, test_mode: bool = False):
    """Build a KG2 graph from the repoDB drug-repositioning CSV file.

    Emits a single provider node for repoDB plus one edge per CSV row,
    where the relation encodes the clinical-trial status and phase; NCT
    identifiers (when present) are attached as publications.
    """
    # NOTE(review): getmtime returns a raw epoch float, not a formatted
    # date string — confirm downstream consumers accept this.
    update_date = os.path.getmtime(input_file_name)
    nodes = [
        kg2_util.make_node(id=REPODB_CURIE + ':',
                           iri=REPODB_IRI,
                           name='repoDB drug repositioning database',
                           category_label=kg2_util.BIOLINK_CATEGORY_DATA_FILE,
                           update_date=update_date,
                           provided_by=REPODB_CURIE + ':')
    ]
    edges = []
    df = pd.read_csv(input_file_name)
    # Perf fix: compute each column's null mask once, instead of
    # rebuilding the entire boolean Series (df[col].isna()) on every row
    # of the loop, which made the loop effectively quadratic.
    status_isna = df['status'].isna()
    phase_isna = df['phase'].isna()
    nct_isna = df['NCT'].isna()
    for idx in range(len(df)):
        if not status_isna[idx]:
            status = df['status'][idx].lower()
        else:
            status = "unknown_status"
        if not phase_isna[idx]:
            phase = df['phase'][idx].lower().replace(" ", "_").replace(
                "/", "_or_")
        else:
            phase = "unknown_phase"
        relation = "clinically_tested_" + status + "_" + phase
        edge_dict = kg2_util.make_edge(
            subject_id=DRUGBANK_CURIE + ':' + df['drug_id'][idx],
            object_id=UMLS_CURIE + ':' + df['ind_id'][idx],
            relation_curie=REPODB_CURIE + ':' + relation,
            relation_label=relation,
            provided_by=REPODB_CURIE + ':',
            update_date=None)
        if not nct_isna[idx]:
            # attach the clinical-trial identifier as a publication
            edge_dict['publications'].append(NCT_CURIE + df['NCT'][idx])
            edge_dict['publications_info'][
                NCT_CURIE + df['NCT'][idx]] = \
                CLINICALTRIALS_IRI + df['NCT'][idx]
        edges.append(edge_dict)
    return {'nodes': nodes, 'edges': edges}
if not record_of_relation_curie_occurrences[relation_curie]: print( 'relation curie is in the config file but was not used in any edge in the graph: ' + relation_curie, file=sys.stderr) for relation_curie in relation_curies_not_in_nodes: print('could not find a node for relation curie: ' + relation_curie) update_date = datetime.now().strftime("%Y-%m-%d %H:%M") version_file = open(args.versionFile, 'r') build_name = str for line in version_file: test_flag = "" if test_mode: test_flag = "-TEST" build_name = "RTX KG" + line.rstrip() + test_flag break build_node = kg2_util.make_node(kg2_util.CURIE_PREFIX_RTX + ':' + 'KG2', kg2_util.BASE_URL_RTX + 'KG2', build_name, kg2_util.BIOLINK_CATEGORY_DATA_FILE, update_date, kg2_util.CURIE_PREFIX_RTX + ':') build_info = { 'version': build_node['name'], 'timestamp_utc': build_node['update_date'] } pprint.pprint(build_info) graph["build"] = build_info graph["nodes"].append(build_node) kg2_util.save_json(graph, output_file_name, test_mode) del graph
metabolite_count += 1 if metabolite_count <= 10000: hmdb_id = metabolite["accession"] nodes.append(make_node(metabolite, hmdb_id)) for edge in make_disease_edges(metabolite, hmdb_id): edges.append(edge) for edge in make_protein_edges(metabolite, hmdb_id): edges.append(edge) for edge in make_equivalencies(metabolite, hmdb_id): edges.append(edge) for edge in make_property_edges(metabolite, hmdb_id): edges.append(edge) else: break file_update_date = convert_date(os.path.getmtime(args.inputFile)) hmdb_kp_node = kg2_util.make_node(HMDB_PROVIDED_BY_CURIE_ID, HMDB_KB_IRI, "Human Metabolome Database", kg2_util.BIOLINK_CATEGORY_DATA_FILE, file_update_date, HMDB_PROVIDED_BY_CURIE_ID) nodes.append(hmdb_kp_node) print("Saving JSON at", date()) kg2_util.save_json({ "nodes": nodes, "edges": edges }, args.outputFile, args.test) print("Finished saving JSON at", date()) print("Script finished at", date())
def get_nodes(connection, test):
    """Query the Reactome MySQL database and return a list of KG2 node dicts.

    :param connection: an open pymysql connection to the Reactome database
    :param test: when truthy, the SQL query is limited to
        ROW_LIMIT_TEST_MODE rows
    :return: list of node dicts built via kg2_util.make_node
    """
    nodes = []
    # This MySQL query uses the StableIdentifier table,
    # which holds all of the node IDs for Reactome, as
    # its left most table. Then, it inner joins the
    # DatabaseObject table, which contains identifiers (called
    # the DB_ID) that can be linked to all of the other tables,
    # which the StableIdentifier can not be. Then, the
    # various node properties are added on using left joins.
    # In general, there are three types of nodes: events (which
    # includes pathways and reactions), physical entities (which
    # includes polymers, drugs, and complexes), and regulations.
    # The regulations are nodes that stand in for edges. As a result,
    # they are filtered out in category assignment. However, we retrieve
    # them in this statement in case they are wanted later.
    # Each general node type has different table linkage to
    # retrieve its publications and description. As a result,
    # this statement uses left joins, so that each node gets the
    # publications and description that fits it. However,
    # nodes can have more than one publication, so we have
    # to use GROUP BY and GROUP_CONCAT to ensure that each node
    # is only included in the knowledge graph once and all of its
    # publications are on it. In addition, this statement includes
    # DISTINCT when using GROUP_CONCAT, because we don't need repeats
    # of the various fields; it is merely a way to collapse all iterations
    # of the node (because each publication creates a new row of the node)
    # into one.
    nodes_sql = "SELECT si.identifier as node_id, \
                 GROUP_CONCAT(DISTINCT dbobj._displayName) as node_name, \
                 GROUP_CONCAT(DISTINCT dbobj._timestamp) as update_date, \
                 GROUP_CONCAT(DISTINCT dbobj._class) as category, \
                 GROUP_CONCAT(DISTINCT lit_fr_e.pubMedIdentifier) as pmid_event, \
                 GROUP_CONCAT(DISTINCT lit_fr_p.pubMedIdentifier) as pmid_entity, \
                 GROUP_CONCAT(DISTINCT sum_fr_e.text) as description_event, \
                 GROUP_CONCAT(DISTINCT sum_fr_p.text) as description_entity, \
                 GROUP_CONCAT(DISTINCT sum_fr_r.text) as description_regulation, \
                 GROUP_CONCAT(DISTINCT ins_ed.dateTime) as created_date \
                 FROM StableIdentifier si \
                 INNER JOIN DatabaseObject dbobj \
                 ON si.DB_ID=dbobj.stableIdentifier \
                 LEFT JOIN InstanceEdit ins_ed \
                 ON dbobj.created=ins_ed.DB_ID \
                 LEFT JOIN Event_2_literatureReference ev_lit \
                 ON dbobj.DB_ID=ev_lit.DB_ID \
                 LEFT JOIN LiteratureReference lit_fr_e \
                 ON lit_fr_e.DB_ID=ev_lit.literatureReference \
                 LEFT JOIN Event_2_summation ev_sum \
                 ON ev_sum.DB_ID=dbobj.DB_ID \
                 LEFT JOIN Summation sum_fr_e \
                 ON ev_sum.summation=sum_fr_e.DB_ID \
                 LEFT JOIN PhysicalEntity_2_literatureReference pe_lit \
                 ON dbobj.DB_ID=pe_lit.DB_ID \
                 LEFT JOIN LiteratureReference lit_fr_p \
                 ON lit_fr_p.DB_ID=pe_lit.literatureReference \
                 LEFT JOIN PhysicalEntity_2_summation pe_sum \
                 ON dbobj.DB_ID=pe_sum.DB_ID \
                 LEFT JOIN Summation sum_fr_p \
                 ON pe_sum.summation = sum_fr_p.DB_ID \
                 LEFT JOIN Regulation_2_summation reg_sum \
                 on reg_sum.DB_ID=dbobj.DB_ID \
                 LEFT JOIN Summation sum_fr_r \
                 ON sum_fr_r.DB_ID=reg_sum.summation \
                 GROUP BY si.identifier"
    if test:
        nodes_sql += " LIMIT " + str(ROW_LIMIT_TEST_MODE)
    for result in run_sql(nodes_sql, connection):
        # result[0] is the Reactome stable identifier; species filtering
        # happens inside only_include_certain_species
        node_id = only_include_certain_species(
            kg2_util.CURIE_PREFIX_REACTOME + ':' + result[0])
        if node_id is None:
            continue
        name = result[1]
        update_date = str(result[2])
        try:
            category_label = match_reactome_category_to_biolink(result[3])
            if category_label is None:
                # regulations (edge-like nodes) map to None and are skipped
                continue
        except KeyError:
            print("Category for", result[3],
                  "not in match_reactome_category_to_biolink")
            continue
        publications_event = result[4]
        publications_phy_ent = result[5]
        description_event = result[6]
        description_phy_ent = result[7]
        descrption_reg = result[8]
        iri = REACTOME_BASE_IRI + result[0]
        created_date = result[9]
        # Check to see which general type of node it is and generate the
        # publications list using that (GROUP_CONCAT returns a
        # comma-separated string of PMIDs)
        if publications_event is not None:
            publications = publications_event.split(',')
            publications = [
                PMID_PREFIX + ':' + publication
                for publication in publications
            ]
        elif publications_phy_ent is not None:
            publications = publications_phy_ent.split(',')
            publications = [
                PMID_PREFIX + ':' + publication
                for publication in publications
            ]
        else:
            publications = []
        # Check to see which general type of node it is and assign the
        # node's description based on that
        if description_event is not None:
            description = description_event
        elif description_phy_ent is not None:
            description = description_phy_ent
        else:
            description = descrption_reg
        node = kg2_util.make_node(node_id, iri, name, category_label,
                                  update_date, REACTOME_KB_CURIE_ID)
        node['description'] = description
        node['publications'] = publications
        node['creation_date'] = str(created_date)
        nodes.append(node)
    return nodes
for edge in get_physical_entity_characteristics(connection, test): edges.append(edge) for edge in get_members_of_set(connection, test): edges.append(edge) for edge in get_species(connection, test): edges.append(edge) return edges if __name__ == '__main__': args = get_args() connection = pymysql.connect(read_default_file=args.mysqlConfigFile, db=args.mysqlDBName) run_sql("SET SESSION group_concat_max_len=35000", connection) run_sql("SET SESSION sort_buffer_size=256000000", connection) nodes = get_nodes(connection, args.test) edges = get_edges(connection, args.test) kp_node = kg2_util.make_node(REACTOME_KB_CURIE_ID, REACTOME_KB_IRI, 'Reactome', kg2_util.BIOLINK_CATEGORY_DATA_FILE, None, REACTOME_KB_CURIE_ID) nodes.append(kp_node) graph = {'nodes': nodes, 'edges': edges} kg2_util.save_json(graph, args.outputFile, args.test)
relation, relation_label, INTACT_KB_CURIE_ID, update_date) edge['publications'] = publications return edge return None if __name__ == '__main__': args = get_args() with open(args.inputFile, 'r') as intact: edges = [] nodes = [] edge_count = 0 for row in intact: edge = make_edge(row) if edge is not None and (args.test is False or edge_count < EDGE_LIMIT_TEST_MODE): edges.append(edge) edge_count += 1 kp_node = kg2_util.make_node(INTACT_KB_CURIE_ID, INTACT_KB_URI, "IntAct", kg2_util.BIOLINK_CATEGORY_DATA_FILE, None, INTACT_KB_CURIE_ID) nodes.append(kp_node) graph = {'edges': edges, 'nodes': nodes} kg2_util.save_json(graph, args.outputFile, args.test)
def make_nodes(records: list):
    """Convert parsed UniProtKB flat-file records into KG2 node dicts.

    :param records: list of dicts keyed by UniProt line codes (AC, DE, DT,
        CC, RX, GN, OS, SQ, ...)
    :return: dict mapping node CURIE -> node dict
    """
    ret_dict = {}
    for record_dict in records:
        xrefs = set()
        # Pull cross-references out of the free-text CC (comment) block;
        # only CATALYTIC ACTIVITY and COFACTOR comments are scanned.
        if 'CC' in record_dict:
            freetext_comments_str = record_dict['CC']
            freetext_comments_list = list(
                map(lambda thestr: thestr.strip(),
                    freetext_comments_str.split('-!-')))
            for comment_str in freetext_comments_list:
                if comment_str.startswith(
                        'CATALYTIC ACTIVITY:') or comment_str.startswith(
                            'COFACTOR:'):
                    xref_match_res = REGEX_XREF.search(comment_str)
                    if xref_match_res is not None:
                        xrefs |= set(
                            filter(None,
                                   map(fix_xref,
                                       xref_match_res[1].split(','))))
        # First accession is the primary one; the rest become synonyms.
        accession_list = record_dict['AC']
        accession = accession_list[0]
        synonyms = []
        if len(accession_list) > 1:
            synonyms += accession_list[1:(len(accession_list) + 1)]
        # Walk the DE (description) lines to extract the recommended name,
        # short name, alternative names, and EC-number cross-references;
        # any other non-Flags/Contains lines are appended to the
        # description text.
        description_list = record_dict['DE']
        full_name = None
        short_name = None
        desc_ctr = 0
        description = record_dict.get('CC', '')
        for description_str in description_list:
            description_str = description_str.lstrip()
            if description_str.startswith('RecName: '):
                full_name = description_str.replace('RecName: Full=', '')
                # a Short= line, when present, immediately follows RecName
                if desc_ctr < len(description_list) - 1:
                    next_desc = description_list[desc_ctr + 1].lstrip()
                    if next_desc.startswith('Short='):
                        short_name = next_desc.replace('Short=', '')
                        synonyms += [short_name]
                # continue
            elif description_str.startswith('AltName: Full='):
                synonyms.append(description_str.replace('AltName: Full=', ''))
            elif description_str.startswith('AltName: CD_antigen='):
                synonyms.append(
                    description_str.replace('AltName: CD_antigen=', ''))
            elif description_str.startswith('EC='):
                ec_match = REGEX_EC_XREF.search(description_str)
                if ec_match is not None:
                    xrefs.add(kg2_util.CURIE_PREFIX_KEGG + ':' + 'EC:' +
                              ec_match[1])
            elif not description_str.startswith(
                    'Flags:') and not description_str.startswith('Contains:'):
                description += '; ' + description_str
            desc_ctr += 1
        # DT lines: first one is the creation date, last one the update
        # date (only the part before the comma is kept).
        date_fields = record_dict['DT']
        date_ctr = 0
        creation_date = None
        update_date = None
        for date_str_raw in date_fields:
            date_str = date_str_raw.split(',')[0]
            if date_ctr == 0:
                creation_date = date_str
            if date_ctr == len(date_fields) - 1:
                update_date = date_str
            date_ctr += 1
        # RX lines hold literature references; normalize "PubMed=1234" to
        # a PMID CURIE.
        publications_raw = record_dict.get('RX', None)
        publications = []
        if publications_raw is not None:
            for pub in publications_raw.split(';'):
                pub = pub.strip()
                if len(pub) > 0:
                    publications.append(
                        pub.replace('=', ':').replace(
                            'PubMed:', kg2_util.CURIE_PREFIX_PMID + ':'))
        else:
            publications = []
        assert type(publications) == list
        assert type(description) == str
        # also harvest PubMed references mentioned in the description text
        publications += [
            pub.replace('PubMed:', kg2_util.CURIE_PREFIX_PMID + ':')
            for pub in REGEX_PUBLICATIONS.findall(description)
        ]
        publications = sorted(list(set(publications)))
        # GN lines: extract the gene symbol (inserted at the head of the
        # synonym list) and any gene-name synonyms.
        gene_names_str = record_dict.get('GN', None)
        gene_symbol = None
        if gene_names_str is not None:
            gene_names_str_list = gene_names_str.split(';')
            for gene_names_str_item in gene_names_str_list:
                gene_names_match = REGEX_GENE_NAME.match(gene_names_str_item)
                if gene_names_match is not None:
                    gene_symbol = gene_names_match[1]
                    synonyms.insert(0, gene_symbol)
                else:
                    gene_synonyms_match = REGEX_GENE_SYNONYMS.match(
                        gene_names_str_item)
                    if gene_synonyms_match is not None:
                        # evidence codes from gene synonyms are not preserved
                        synonyms += [
                            seperate_evidence_codes(syn)[0].strip()
                            for syn in gene_synonyms_match[1].split(',')
                        ]
        # Node name preference: gene symbol > short name > full name.
        if gene_symbol is not None:
            name = gene_symbol
        else:
            if short_name is not None:
                name = short_name
            else:
                name = full_name
        # move evidence codes from name to description (issue #1171)
        name, ev_codes = seperate_evidence_codes(name)
        description += f"Evidence Codes from Name: {ev_codes} "
        # append species name to name if not human (issue #1171)
        # NOTE(review): the species literal below looks garbled ("h**o") —
        # confirm it matches the upstream source before relying on it.
        species = record_dict.get('OS', 'unknown species').rstrip(".")
        if "h**o sapiens (human)" not in species.lower():
            name += f" ({species})"
        node_curie = kg2_util.CURIE_PREFIX_UNIPROT + ':' + accession
        iri = UNIPROTKB_IDENTIFIER_BASE_IRI + accession
        category_label = kg2_util.BIOLINK_CATEGORY_PROTEIN
        node_dict = kg2_util.make_node(node_curie, iri, name, category_label,
                                       update_date,
                                       UNIPROTKB_PROVIDED_BY_CURIE_ID)
        node_dict['full_name'] = full_name
        if not description.endswith(' '):
            description += ' '
        # NOTE(review): str.strip('SEQUENCE ') strips any of those
        # CHARACTERS from both ends, not the prefix — trailing amino-acid
        # letters in the SEQUENCE character set could be removed; confirm
        # intended behavior.
        sequence = record_dict.get('SQ', '').strip('SEQUENCE ')
        node_dict['has_biological_sequence'] = sequence
        description = description.replace(LICENSE_TEXT, '')
        node_dict['description'] = description
        # keep the first synonym (gene symbol or primary alt name) first,
        # de-duplicating the remainder
        if len(synonyms) > 0:
            synonyms = [synonyms[0]] + list(set(synonyms) - {synonyms[0]})
        node_dict['synonym'] = synonyms
        node_dict['publications'] = publications
        node_dict['creation_date'] = creation_date
        if len(xrefs) == 0:
            xrefs = None
        node_dict['xrefs'] = xrefs
        ret_dict[node_curie] = node_dict
    return ret_dict
node_dict['creation_date'] = creation_date if len(xrefs) == 0: xrefs = None node_dict['xrefs'] = xrefs ret_dict[node_curie] = node_dict return ret_dict # --------------- main starts here ------------------- if __name__ == '__main__': args = make_arg_parser().parse_args() test_mode = args.test input_file_name = args.inputFile output_file_name = args.outputFile [uniprot_records, update_date] = parse_records_from_uniprot_dat(input_file_name, DESIRED_SPECIES_INTS, test_mode) nodes_dict = make_nodes(uniprot_records) ontology_curie_id = UNIPROTKB_PROVIDED_BY_CURIE_ID ont_node = kg2_util.make_node(ontology_curie_id, UNIPROT_KB_URL, 'UniProtKB', kg2_util.BIOLINK_CATEGORY_DATA_FILE, update_date, ontology_curie_id) nodes_list = [ont_node] + [node_dict for node_dict in nodes_dict.values()] edges_list = make_edges(uniprot_records, nodes_dict) output_graph = {'nodes': nodes_list, 'edges': edges_list} kg2_util.save_json(output_graph, output_file_name, test_mode)
def make_kg2_graph(input_file_name: str, test_mode: bool = False):
    """Build the KG2 graph from the NCBI gene_info TSV file.

    Emits a knowledge-provider node for NCBI Genes, one node per human
    gene, an in_taxon edge per gene, and same_as edges for HGNC/Ensembl/
    OMIM/miRBase cross-references. In test mode, stops after 10000 lines.

    :param input_file_name: path to the gene_info TSV file
    :param test_mode: limit processing for fast test builds
    :return: {'nodes': [...], 'edges': [...]}
    """
    nodes = []
    edges = []
    gene_ctr = 0
    # NOTE(review): getmtime returns a raw epoch float, not a formatted
    # date string — confirm downstream consumers accept this.
    update_date = os.path.getmtime(input_file_name)
    ontology_curie_id = NCBI_KB_CURIE_ID
    ens_kp_node = kg2_util.make_node(ontology_curie_id, NCBI_KB_URL,
                                     'NCBI Genes',
                                     kg2_util.BIOLINK_CATEGORY_DATA_FILE,
                                     update_date, ontology_curie_id)
    nodes.append(ens_kp_node)
    with open(input_file_name, 'r') as input_file:
        for line in input_file:
            if line.startswith('#'):
                # header/comment line
                continue
            gene_ctr += 1
            if test_mode and gene_ctr > 10000:
                break
            fields = line.rstrip("\n").split("\t")
            # NCBI uses '-' to mean "no value"; map those to None
            fields = [(field if field.strip() != '-' else None)
                      for field in fields]
            [
                taxon_id_str, ncbi_gene_id, gene_symbol, locus_tag,
                synonyms_str, db_xrefs, chromosome, map_location,
                description, type_of_gene, symbol_auth, full_name_auth,
                nomenc_status, other_desig, modify_date, feature_type
            ] = fields
            taxon_id_int = int(taxon_id_str)
            if taxon_id_int != kg2_util.NCBI_TAXON_ID_HUMAN:
                # skip neanderthal- and denisovan-specific genes
                continue
            node_synonyms = list()
            if synonyms_str is not None:
                node_synonyms += synonyms_str.split('|')
            if other_desig is not None:
                node_synonyms += other_desig.split('|')
            if symbol_auth is not None and symbol_auth != gene_symbol:
                node_synonyms = [symbol_auth] + node_synonyms
            node_synonyms = list(set(node_synonyms))
            full_name = full_name_auth
            if full_name is None:
                full_name = description
            # Records whose only xref is an OMIM (MIM:) entry, with unknown
            # gene type and no official nomenclature, are treated as
            # genetic loci rather than genes.
            if type_of_gene != "unknown" or (db_xrefs is None) or (not db_xrefs.startswith("MIM:")) or \
               nomenc_status is not None:
                category_label = kg2_util.BIOLINK_CATEGORY_GENE
            else:
                full_name = 'Genetic locus associated with ' + full_name
                category_label = kg2_util.BIOLINK_CATEGORY_GENOMIC_ENTITY
            if full_name.startswith('microRNA'):
                category_label = kg2_util.BIOLINK_CATEGORY_MICRORNA
            node_dict = make_node(ncbi_gene_id, full_name, gene_symbol,
                                  modify_date, category_label,
                                  node_synonyms)
            node_curie_id = node_dict['id']
            type_str = 'Type:' + type_of_gene
            node_description = ''
            if description is not None and description != full_name_auth:
                node_description = description + '; '
            node_description += type_str
            if nomenc_status is not None:
                nomenc_tag = 'official'
            else:
                nomenc_tag = 'unofficial'
            if map_location is not None:
                node_description += '; Locus:' + map_location
            node_description += '; NameStatus:' + nomenc_tag
            node_dict['description'] = node_description
            nodes.append(node_dict)
            # in_taxon edge from the gene to the human taxon node
            org_curie = kg2_util.CURIE_PREFIX_NCBI_TAXON + ':' + taxon_id_str
            predicate_label = 'in_taxon'
            edge_dict = kg2_util.make_edge_biolink(node_curie_id, org_curie,
                                                   predicate_label,
                                                   NCBI_KB_CURIE_ID,
                                                   modify_date)
            edges.append(edge_dict)
            # same_as edges to cross-referenced identifier systems, with
            # per-system CURIE normalization
            if db_xrefs is not None:
                xrefs_list = db_xrefs.split('|')
                for xref_curie in xrefs_list:
                    if xref_curie.startswith('HGNC:HGNC:'):
                        xref_curie = kg2_util.CURIE_PREFIX_HGNC + ':' + xref_curie.replace(
                            'HGNC:', '')
                    elif xref_curie.startswith('Ensembl:'):
                        xref_curie = xref_curie.upper()
                    elif xref_curie.startswith('MIM:'):
                        xref_curie = kg2_util.CURIE_PREFIX_OMIM + ':' + xref_curie.replace(
                            'MIM:', '')
                    elif xref_curie.startswith('miRBase:'):
                        xref_curie = kg2_util.CURIE_PREFIX_MIRBASE + ':' + xref_curie.replace(
                            'miRBase:', '')
                    edges.append(
                        kg2_util.make_edge(node_curie_id, xref_curie,
                                           kg2_util.CURIE_ID_OWL_SAME_AS,
                                           kg2_util.EDGE_LABEL_OWL_SAME_AS,
                                           NCBI_KB_CURIE_ID, modify_date))
    return {'nodes': nodes, 'edges': edges}
arg_parser.add_argument('inputFile', type=str) arg_parser.add_argument('outputFile', type=str) return arg_parser if __name__ == '__main__': args = make_arg_parser().parse_args() input_file_name = args.inputFile output_file_name = args.outputFile test_mode = args.test edges = [] nodes = [] file_update_date = kg2_util.convert_date(os.path.getmtime(args.inputFile)) unichem_kp_node = kg2_util.make_node(UNICHEM_KB_CURIE, UNICHEM_KB_IRI, "UniChem database", kg2_util.BIOLINK_CATEGORY_DATA_FILE, file_update_date, UNICHEM_KB_CURIE) nodes.append(unichem_kp_node) update_date = None line_ctr = 0 with open(input_file_name, 'r') as input_file: for line in input_file: if line.startswith('#'): update_date = line.split('# ')[1].rstrip() continue line_ctr += 1 if test_mode and line_ctr > 10000: break (subject_curie_id, object_curie_id) = line.rstrip().split('\t') edges.append(
def make_kg2_graph(input_file_name: str, test_mode: bool = False):
    """Parse a DGIdb interactions TSV dump into a KG2 nodes/edges graph.

    :param input_file_name: path to the DGIdb ``interactions.tsv`` file;
        '#'-prefixed header lines carry the data release date
    :param test_mode: if True, stop after reading 10,000 data rows
    :return: a dict with keys ``'nodes'`` and ``'edges'``
    """
    nodes = []
    edges = []
    line_ctr = 0
    update_date = None
    with open(input_file_name, 'r') as input_file:
        for line in input_file:
            line = line.rstrip("\n")
            if line.startswith('#'):
                # the comment header line carries the data release date
                update_date = line.replace('#', '')
                continue
            if line.startswith('gene_name\t'):
                continue  # skip the column-header row
            line_ctr += 1
            if test_mode and line_ctr > 10000:
                break
            fields = line.split("\t")
            [gene_name, gene_claim_name, entrez_id,
             interaction_claim_source, interaction_types,
             drug_claim_name, drug_claim_primary_name,
             drug_name, drug_chembl_id, PMIDs] = fields
            # BUGFIX: reset the subject/object CURIEs on every row.
            # Previously these were only assigned inside conditionals, so a
            # row lacking an Entrez ID (or a drug ID) either raised NameError
            # on the first data row or silently reused the CURIE left over
            # from an earlier row, producing edges between unrelated entities.
            subject_curie_id = None
            object_curie_id = None
            if entrez_id != "":
                object_curie_id = 'NCBIGene:' + entrez_id
            if drug_chembl_id != "":
                subject_curie_id = 'CHEMBL.COMPOUND:' + drug_chembl_id
            else:
                if drug_claim_name != "":
                    node_pubs_list = []
                    if interaction_claim_source == "GuideToPharmacologyInteractions":
                        subject_curie_id = GTPI_CURIE_PREFIX + ':' + drug_claim_name
                        # some GtoP primary names embed a PMID; split it out
                        # into the node's publications list
                        pmid_match = RE_PMID.match(drug_claim_primary_name)
                        if pmid_match is not None:
                            node_pubs_list = [pmid_match[2].replace(' ', '').strip()]
                            node_name = pmid_match[1].strip()
                        else:
                            node_name = drug_claim_primary_name
                        node_iri = GTPI_IRI_BASE + GTPI_LIGAND_SUFFIX + drug_claim_name
                        provided_by = GTPI_IRI_BASE
                    elif interaction_claim_source == "TTD":
                        subject_curie_id = TTD_CURIE_PREFIX + ':' + drug_claim_name
                        node_name = drug_claim_primary_name
                        node_iri = TTD_IRI_BASE + drug_claim_name
                        provided_by = TTD_IRI_BASE
                    if subject_curie_id is not None:
                        node_dict = kg2_util.make_node(subject_curie_id,
                                                       node_iri,
                                                       node_name,
                                                       'chemical_substance',
                                                       update_date,
                                                       provided_by)
                        node_dict['publications'] = node_pubs_list
                        nodes.append(node_dict)
                if subject_curie_id is None:
                    print("DGIDB: no controlled ID was provided for this drug: " +
                          drug_claim_primary_name +
                          "; source DB: " + interaction_claim_source,
                          file=sys.stderr)
                    continue
            if object_curie_id is None:
                # no Entrez gene ID on this row, so there is no gene node to
                # link the drug to; skip instead of emitting a bogus edge
                print("DGIDB: no Entrez gene ID was provided for this row; gene: " +
                      gene_claim_name, file=sys.stderr)
                continue
            if interaction_types == "":
                interaction_types = "affects"
            pmids_list = []
            if PMIDs.strip() != "":
                pmids_list = [('PMID:' + pmid.strip())
                              for pmid in PMIDs.split(',')]
            for interaction in interaction_types.split(','):
                interaction = interaction.replace(' ', '_')
                edge_dict = kg2_util.make_edge(subject_curie_id,
                                               object_curie_id,
                                               DGIDB_BASE_IRI + '/' +
                                               kg2_util.convert_snake_case_to_camel_case(interaction),
                                               DGIDB_CURIE_PREFIX + ':' + interaction,
                                               interaction,
                                               DGIDB_BASE_IRI,
                                               update_date)
                edge_dict['publications'] = pmids_list
                edges.append(edge_dict)
    return {'nodes': nodes, 'edges': edges}
def make_kg2_graph(input_file_name: str, test_mode: bool = False):
    """Parse a DGIdb interactions TSV dump into a KG2 nodes/edges graph.

    :param input_file_name: path to the DGIdb ``interactions.tsv`` file;
        '#'-prefixed header lines carry the data release date
    :param test_mode: if True, stop after reading 10,000 data rows
    :return: a dict with keys ``'nodes'`` and ``'edges'``
    """
    nodes = []
    edges = []
    line_ctr = 0
    update_date = None
    with open(input_file_name, 'r') as input_file:
        for line in input_file:
            line = line.rstrip("\n")
            if line.startswith('#'):
                # the comment header line carries the data release date
                update_date = line.replace('#', '')
                continue
            if line.startswith('gene_name\t'):
                continue  # skip the column-header row
            line_ctr += 1
            if test_mode and line_ctr > 10000:
                break
            fields = line.split("\t")
            [gene_name, gene_claim_name, entrez_id,
             interaction_claim_source, interaction_types,
             drug_claim_name, drug_claim_primary_name,
             drug_name, drug_concept_id,
             _,  # 12.5.2020 new field in tsv: interaction group score
             PMIDs] = fields
            # BUGFIX: reset the subject/object CURIEs on every row.
            # Previously these were only assigned inside conditionals, so a
            # row lacking an Entrez ID (or a drug concept ID) either raised
            # NameError on the first data row or silently reused the CURIE
            # left over from an earlier row, producing edges between
            # unrelated entities.
            subject_curie_id = None
            object_curie_id = None
            if entrez_id != "":
                object_curie_id = kg2_util.CURIE_PREFIX_NCBI_GENE + ':' + entrez_id
            if drug_concept_id != "":
                if "chembl" in drug_concept_id:
                    _, chembl_id = drug_concept_id.split(":")
                    subject_curie_id = kg2_util.CURIE_PREFIX_CHEMBL_COMPOUND + ':' + chembl_id
                else:
                    # skipping over wikidata nodes, see #1185
                    print(f"DGIDB: Skipping row with drug concept id {drug_concept_id}",
                          file=sys.stderr)
                    continue
            else:
                if drug_claim_name != "":
                    node_pubs_list = []
                    if interaction_claim_source == INTERACTION_CLAIM_SOURCE_GTPI:
                        subject_curie_id = GTPI_CURIE_PREFIX + ':' + drug_claim_name
                        # some GtoP primary names embed a PMID; split it out
                        # into the node's publications list
                        pmid_match = RE_PMID.match(drug_claim_primary_name)
                        if pmid_match is not None:
                            node_pubs_list = [pmid_match[2].replace(' ', '').strip()]
                            node_name = pmid_match[1].strip()
                        else:
                            node_name = drug_claim_primary_name
                        node_iri = GTPI_BASE_URL + drug_claim_name
                        provided_by = GTPI_KB_CURIE
                    elif interaction_claim_source == INTERACTION_CLAIM_SOURCE_TTD:
                        subject_curie_id = TTD_CURIE_PREFIX + ':' + drug_claim_name
                        node_name = drug_claim_primary_name
                        node_iri = TTD_IRI_BASE + drug_claim_name
                        provided_by = TTD_KB_CURIE
                    if subject_curie_id is not None:
                        node_dict = kg2_util.make_node(subject_curie_id,
                                                       node_iri,
                                                       node_name,
                                                       kg2_util.BIOLINK_CATEGORY_CHEMICAL_SUBSTANCE,
                                                       update_date,
                                                       provided_by)
                        node_dict['publications'] = node_pubs_list
                        nodes.append(node_dict)
                if subject_curie_id is None:
                    print("DGIDB: no controlled ID was provided for this drug: " +
                          drug_claim_primary_name +
                          "; source DB: " + interaction_claim_source,
                          file=sys.stderr)
                    continue
            if object_curie_id is None:
                # no Entrez gene ID on this row, so there is no gene node to
                # link the drug to; skip instead of emitting a bogus edge
                print("DGIDB: no Entrez gene ID was provided for this row; gene: " +
                      gene_claim_name, file=sys.stderr)
                continue
            if interaction_types == "":
                print("DGIDB: interaction type was empty. Setting to 'affects'.",
                      file=sys.stderr)
                interaction_types = "affects"
            pmids_list = []
            if PMIDs.strip() != "":
                pmids_list = [(kg2_util.CURIE_PREFIX_PMID + ':' + pmid.strip())
                              for pmid in PMIDs.split(',')]
            for interaction in interaction_types.split(','):
                interaction = interaction.replace(' ', '_')
                edge_dict = kg2_util.make_edge(subject_curie_id,
                                               object_curie_id,
                                               DGIDB_CURIE_PREFIX + ':' + interaction,
                                               interaction,
                                               DGIDB_KB_CURIE,
                                               update_date)
                edge_dict['publications'] = pmids_list
                edges.append(edge_dict)
    return {'nodes': nodes, 'edges': edges}
[data, update_date] = make_node_and_edges(article, mesh_predicate_label) for node in data["nodes"]: nodes.append(node) for edge in data["edges"]: edges.append(edge) if date_to_num(update_date) > latest_date: latest_date = date_to_num(update_date) latest_date = { "Year": str(latest_date)[0:4], "Month": str(latest_date)[4:6], "Day": str(latest_date)[6:] } pmid_kp_node = kg2_util.make_node(PMID_PROVIDED_BY_CURIE_ID, PMID_KB_IRI, "PubMed", kg2_util.BIOLINK_CATEGORY_DATA_FILE, extract_date(latest_date), PMID_PROVIDED_BY_CURIE_ID) nodes.append(pmid_kp_node) print("Saving JSON:", date()) kg2_util.save_json({ "nodes": nodes, "edges": edges }, args.outputFile, args.test) print("Finished saving JSON:", date()) del nodes del edges print("Script Finished:", date())
else: edge = kg2_util.make_edge_biolink( node_id, xref_id, kg2_util.EDGE_LABEL_BIOLINK_RELATED_TO, MIRBASE_KB_CURIE_ID, None) edges.append(edge) taxon_edge_count = 0 for node_id in nodes_to_species: taxon_edge_count += 1 if test_mode and taxon_edge_count > 1000: break taxon_edge = kg2_util.make_edge_biolink( node_id, nodes_to_species[node_id], kg2_util.EDGE_LABEL_BIOLINK_IN_TAXON, MIRBASE_KB_CURIE_ID, None) edges.append(taxon_edge) return edges if __name__ == '__main__': args = get_args() with open(args.inputFile, 'r') as mirbase: entries = format_data(mirbase) kp_node = kg2_util.make_node(MIRBASE_KB_CURIE_ID, MIRBASE_KB_URL, 'miRBase', kg2_util.BIOLINK_CATEGORY_DATA_FILE, None, MIRBASE_KB_CURIE_ID) [nodes, xrefs, nodes_to_species] = make_nodes(entries, args.test) nodes.append(kp_node) edges = make_edges(xrefs, nodes_to_species, args.test) graph = {'nodes': nodes, 'edges': edges} kg2_util.save_json(graph, args.outputFile, args.test)