def make_edges(records: list, nodes_dict: dict):
    """Build the edges for a list of parsed UniProtKB records.

    Each record yields one 'gene_product_has_organism_source' edge to its
    NCBITaxon organism, plus one 'encodes' edge for every 'DR'
    cross-reference string matched by REGEX_HGNC or REGEX_NCBIGeneID.

    :param records: record dicts; 'AC' and 'organism' are read, 'DR' is
        optional
    :param nodes_dict: nodes keyed by CURIE; the 'update date' of the
        record's node is copied onto its edges
    :return: list of edges built by kg2_util.make_edge
    """
    edges = []
    for record in records:
        protein_curie = 'UniProtKB:' + record['AC'][0]
        taxon_id = record['organism']
        update_date = nodes_dict[protein_curie]['update date']
        organism_edge = kg2_util.make_edge(protein_curie,
                                           'NCBITaxon:' + str(taxon_id),
                                           'gene_product_has_organism_source',
                                           UNIPROTKB_BASE_IRI,
                                           update_date)
        edges.append(organism_edge)

        xrefs = record.get('DR', None)
        if xrefs is None:
            continue
        for xref in xrefs:
            # Both patterns are tried against every cross-reference string.
            hgnc_hit = REGEX_HGNC.match(xref)
            if hgnc_hit is not None:
                edges.append(kg2_util.make_edge(hgnc_hit[1],
                                                protein_curie,
                                                'encodes',
                                                UNIPROTKB_BASE_IRI,
                                                update_date))
            ncbi_hit = REGEX_NCBIGeneID.match(xref)
            if ncbi_hit is not None:
                edges.append(kg2_util.make_edge('NCBIGene:' + ncbi_hit[1],
                                                protein_curie,
                                                'encodes',
                                                UNIPROTKB_BASE_IRI,
                                                update_date))
    return edges
def make_kg2_graph(input_file_name: str, test_mode: bool = False):
    """Convert a repoDB CSV file into a dict of KG2 nodes and edges.

    One edge is produced per CSV row, linking the row's drug to its
    indication with a relation named
    "clinically_tested_<status>_<phase>".  Missing status/phase values
    become "unknown_status"/"unknown_phase".  When the row has an NCT
    identifier it is recorded in the edge's 'publications' list and
    'publications info' mapping.

    :param input_file_name: path of the CSV file, read with pandas
    :param test_mode: accepted for interface compatibility; not used here
    :return: {'nodes': [], 'edges': [...]}
    """
    nodes = []
    edges = []
    df = pd.read_csv(input_file_name)
    # Walk the needed columns in lockstep.  The previous implementation
    # recomputed df[col].isna() (a full-column scan) for every row, which
    # made the loop quadratic in the number of rows.
    rows = zip(df['status'], df['phase'], df['drug_id'], df['ind_id'],
               df['NCT'])
    for status_raw, phase_raw, drug_id, ind_id, nct_id in rows:
        if pd.isna(status_raw):
            status = "unknown_status"
        else:
            status = status_raw.lower()
        if pd.isna(phase_raw):
            phase = "unknown_phase"
        else:
            phase = phase_raw.lower().replace(" ", "_").replace("/", "_or_")
        relation = "clinically_tested_" + status + "_" + phase
        edge_dict = kg2_util.make_edge(
            subject_id=DRUGBANK_CURIE + drug_id,
            object_id=UMLS_CURIE + ind_id,
            relation=REPODB_IRI + '#' +
            kg2_util.convert_snake_case_to_camel_case(relation),
            relation_curie=REPODB_CURIE + relation,
            predicate_label=relation,
            provided_by=REPODB_IRI,
            update_date=None)
        if not pd.isna(nct_id):
            trial_curie = NCT_CUTRIE + nct_id
            edge_dict['publications'].append(trial_curie)
            edge_dict['publications info'][trial_curie] = \
                CLINICALTRIALS_IRI + nct_id
        edges.append(edge_dict)
    return {'nodes': nodes, 'edges': edges}
def make_rel(preds_dict: dict, subject_curie: str, object_curie: str,
             predicate: str, pmid: str, pub_date: str, sentence: str,
             subject_score: str, object_score: str, negated: bool):
    """Record one SemMedDB predication in ``preds_dict`` (mutated in place).

    Predications are keyed by "<subject>-<predicate>-<object>".  The first
    time a key is seen a new edge is created and stored; on later calls
    only the publication CURIE and its info dict are added to the stored
    edge (``negated`` is not touched on an existing edge).

    Returns None.
    """
    key = '-'.join((subject_curie, predicate, object_curie))
    existing_edge = preds_dict.get(key, None)
    publication_curie = kg2_util.CURIE_PREFIX_PMID + ':' + pmid
    publication_info_dict = {
        'publication date': pub_date,
        'sentence': sentence,
        'subject score': subject_score,
        'object score': object_score
    }

    if existing_edge is not None:
        existing_edge['publications_info'][publication_curie] = \
            publication_info_dict
        # A new list is bound rather than appending in place.
        existing_edge['publications'] = \
            existing_edge['publications'] + [publication_curie]
        return

    relation_type = predicate.lower()
    if relation_type == 'xref':
        relation_curie = 'OBO:xref'
    else:
        relation_curie = SEMMEDDB_CURIE_PREFIX + ':' + relation_type
    new_edge = kg2_util.make_edge(subject_curie,
                                  object_curie,
                                  relation_curie,
                                  relation_type,
                                  SEMMEDDB_CURIE_PREFIX + ':',
                                  curr_timestamp)
    new_edge['publications'] = [publication_curie]
    new_edge['publications_info'] = {
        publication_curie: publication_info_dict
    }
    new_edge['negated'] = negated
    preds_dict[key] = new_edge
def make_edge(subject_curie_id: str, object_curie_id: str,
              predicate_label: str, update_date: str):
    """Create an edge attributed to ENSEMBL_KB_IRI.

    The relation IRI and CURIE are derived from ``predicate_label`` by
    kg2_util.predicate_label_to_iri_and_curie, using
    ENSEMBL_RELATION_CURIE_PREFIX and ENSEMBL_KB_IRI.
    """
    iri_and_curie = kg2_util.predicate_label_to_iri_and_curie(
        predicate_label, ENSEMBL_RELATION_CURIE_PREFIX, ENSEMBL_KB_IRI)
    relation, relation_curie = iri_and_curie
    return kg2_util.make_edge(subject_curie_id,
                              object_curie_id,
                              relation,
                              relation_curie,
                              predicate_label,
                              ENSEMBL_KB_IRI,
                              update_date)
def format_edge(subject_id, object_id, predicate, update_date):
    """Create a DrugCentral edge for ``predicate``.

    The biolink "same as" label is routed through
    kg2_util.make_edge_biolink; every other predicate gets a relation
    CURIE built with DRUGCENTRAL_RELATION_CURIE_PREFIX and goes through
    kg2_util.make_edge.
    """
    # The CURIE is computed up front for every predicate, including the
    # "same as" case where it ends up unused.
    relation_curie = kg2_util.predicate_label_to_curie(
        predicate, DRUGCENTRAL_RELATION_CURIE_PREFIX)
    if predicate != kg2_util.EDGE_LABEL_BIOLINK_SAME_AS:
        return kg2_util.make_edge(subject_id,
                                  object_id,
                                  relation_curie,
                                  predicate,
                                  DRUGCENTRAL_SOURCE,
                                  update_date)
    return kg2_util.make_edge_biolink(subject_id,
                                      object_id,
                                      predicate,
                                      DRUGCENTRAL_SOURCE,
                                      update_date)
def format_edge(subject_id: str, object_id: str, predicate_label: str):
    """Create a Reactome edge for ``predicate_label``.

    The biolink "same as" label is built by kg2_util.make_edge_biolink
    (with an update date of None); any other label gets a relation CURIE
    built with REACTOME_RELATION_CURIE_PREFIX and is built by
    kg2_util.make_edge, which is called without an update date.
    """
    relation_curie = kg2_util.predicate_label_to_curie(
        predicate_label, REACTOME_RELATION_CURIE_PREFIX)
    if predicate_label == kg2_util.EDGE_LABEL_BIOLINK_SAME_AS:
        edge = kg2_util.make_edge_biolink(subject_id,
                                          object_id,
                                          predicate_label,
                                          REACTOME_KB_CURIE_ID,
                                          None)
    else:
        edge = kg2_util.make_edge(subject_id,
                                  object_id,
                                  relation_curie,
                                  predicate_label,
                                  REACTOME_KB_CURIE_ID)
    return edge
def make_edge(subject_curie_id: str, object_curie_id: str,
              predicate_label: str, update_date: str):
    """Create an edge attributed to UNIPROTKB_BASE_IRI.

    The relation IRI is BIOLINK_CATEGORY_BASE_IRI followed by the
    camel-cased predicate label; the relation CURIE is the biolink prefix
    followed by the label with spaces replaced by underscores.
    """
    camel_label = kg2_util.convert_snake_case_to_camel_case(predicate_label)
    relation = kg2_util.BIOLINK_CATEGORY_BASE_IRI + camel_label
    underscored_label = predicate_label.replace(' ', '_')
    relation_curie = kg2_util.BIOLINK_CURIE_PREFIX + ':' + underscored_label
    return kg2_util.make_edge(subject_curie_id,
                              object_curie_id,
                              relation,
                              relation_curie,
                              predicate_label,
                              UNIPROTKB_BASE_IRI,
                              update_date)
def make_edges(input_tsv: str, gene_id_dict: Dict[str, list],
               pmids_dict: Dict[str, Dict[str, set]],
               test_mode: bool) -> list:
    """Build gene-disease "associated_with" edges from a JensenLab TSV.

    Every row must have exactly seven tab-separated columns.  Rows are
    skipped when the gene has no entry in ``gene_id_dict`` or the z-score
    is below 3.0; no edge is produced when the disease has no entry in
    ``pmids_dict['disease']``.  Each edge carries at most 30 publications
    (the PMIDs shared by the gene and the disease) and the z-score as the
    'object score'.

    :param input_tsv: path of the tab-separated input file
    :param gene_id_dict: source gene ID -> list of KG2 gene IDs
    :param pmids_dict: {'gene': {id: set of PMIDs}, 'disease': {...}}
    :param test_mode: stop once more than 1000 distinct gene IDs were seen
    :return: list of edge dicts
    """
    seen_gene_ids = set()
    update_date = datetime.datetime.now().replace(microsecond=0).isoformat()
    with open(input_tsv) as inp:
        edges = []
        for row in csv.reader(inp, delimiter="\t"):
            (gene_id, gene_name, disease_id, disease_name,
             z_score, _, source_url) = row
            seen_gene_ids.add(gene_id)
            kg2_gene_ids = gene_id_dict.get(gene_id, None)
            # No KG2 equivalent for this gene, or association too weak.
            if kg2_gene_ids is None or float(z_score) < 3.0:
                continue
            for kg2_gene_id in kg2_gene_ids:
                if pmids_dict['disease'].get(disease_id, None) is None:
                    # Disease is not known to the PMID index; skip it.
                    continue
                shared_pmids = pmids_dict['gene'][gene_id].intersection(
                    pmids_dict['disease'][disease_id])
                # limit number of publications to 30 for size constraints
                publications_list = list(shared_pmids)[:30]
                edge = kg2_util.make_edge(kg2_gene_id,
                                          disease_id,
                                          "JensenLab:associated_with",
                                          "associated_with",
                                          kg2_util.CURIE_ID_JENSENLAB,
                                          update_date)
                # The z-score is stored in the per-publication info
                # structure, keyed by the edge's object.
                edge["publications"] = publications_list
                edge["publications_info"] = {
                    edge['object']: {
                        'publication date': None,
                        'sentence': None,
                        'subject score': None,
                        'object score': str(z_score)
                    }
                }
                edges.append(edge)
            if test_mode and len(seen_gene_ids) > 1000:
                break
    used_genes_missing_ids = seen_gene_ids - set(gene_id_dict)
    print(
        f"Skipped {len(used_genes_missing_ids)} rows for lack of kg2 gene ids."
    )
    print(
        f"Found {len(seen_gene_ids - used_genes_missing_ids)} used kg2 gene ids."
    )
    return edges
def make_node_and_edges(article: dict, mesh_predicate_label: str,
                        mesh_relation_curie: str):
    """Build the publication node and MeSH edges for one PubMed article.

    A node and edges are produced only when the article's PMID CURIE is in
    the module-level ``pmids`` collection; otherwise both lists are empty.

    :param article: parsed article dict; 'MedlineCitation' is read
    :param mesh_predicate_label: label for the article -> MeSH edges
    :param mesh_relation_curie: relation CURIE for the same edges
    :return: ``[{"nodes": [...], "edges": [...]}, update_date]`` where
        update_date comes from extract_date on 'DateRevised'
    """
    nodes = []
    edges = []
    article_citation = article["MedlineCitation"]
    pmid_text = article_citation["PMID"]["#text"]
    pmid = kg2_util.CURIE_PREFIX_PMID + ":" + pmid_text
    update_date = extract_date(article_citation["DateRevised"])

    if pmid in pmids:
        # Authors and journal are not extracted here; they are not needed
        # yet.
        name = article_citation["Article"]["ArticleTitle"]
        if isinstance(name, dict):
            try:
                name = name["#text"]
            except KeyError:
                # No direct text: take the "#text" of the nested entries
                # (the last one wins).
                temp_name = name
                for key in temp_name:
                    name = temp_name[key]["#text"]

        # The article date is optional; best effort, but no longer
        # swallowing KeyboardInterrupt/SystemExit as a bare except did.
        try:
            created_date = extract_date(
                article_citation["Article"]["ArticleDate"])
        except Exception:
            created_date = None

        iri = PMID_BASE_IRI + pmid_text
        node = kg2_util.make_node(pmid,
                                  iri,
                                  name,
                                  BIOLINK_CATEGORY_PUBLICATION,
                                  update_date,
                                  PMID_PROVIDED_BY_CURIE_ID)
        node["creation_date"] = created_date
        nodes.append(node)

        try:
            mesh_headings = article_citation["MeshHeadingList"]["MeshHeading"]
            # NOTE(review): assumes an xmltodict-style parse, where a lone
            # child element is a dict rather than a one-element list.
            # Iterating that dict directly walked its keys and produced no
            # edge at all, so it is wrapped in a list here.
            if isinstance(mesh_headings, dict):
                mesh_headings = [mesh_headings]
            for mesh_topic in mesh_headings:
                mesh_id = kg2_util.CURIE_PREFIX_MESH + ":" + \
                    mesh_topic["DescriptorName"]["@UI"]
                edges.append(kg2_util.make_edge(pmid,
                                                mesh_id,
                                                mesh_relation_curie,
                                                mesh_predicate_label,
                                                PMID_PROVIDED_BY_CURIE_ID,
                                                update_date))
        except Exception:
            # Articles without usable MeSH headings simply get no (further)
            # MeSH edges; edges appended before the failure are kept.
            pass

    return [{"nodes": nodes, "edges": edges}, update_date]
def make_kg2_graph(input_file_name: str, test_mode: bool = False):
    """Convert an Ensembl genes JSON file into KG2 nodes and edges.

    For each gene a node is built with the file's ``make_node`` helper,
    along with a 'gene_found_in_organism' edge to its NCBITaxon node and an
    'xref' edge for every entry of the gene's optional 'HGNC' list.  The
    update date is the part of the 'genebuild' string after the first '/'.

    :param input_file_name: JSON file loaded with kg2_util.load_json
    :param test_mode: when true, only the first 10000 genes are processed
    :return: {'nodes': [...], 'edges': [...]}
    :raises AssertionError: if a gene's taxon_id is not 9606
    """
    ensembl_data = kg2_util.load_json(input_file_name)
    nodes = []
    edges = []
    update_date = ensembl_data['genebuild'].split('/')[1]

    for gene_number, gene in enumerate(ensembl_data['genes'], start=1):
        if test_mode and gene_number > 10000:
            break
        gene_id = gene['id']
        description = gene.get('description', None)
        gene_symbol = gene.get('name', None)

        # Synonyms are the distinct xref primary IDs other than the gene's
        # own Ensembl ID.
        xrefs = gene.get('xrefs', None)
        if xrefs is None:
            other_synonyms = []
        else:
            other_synonyms = list({xref['primary_id']
                                   for xref in xrefs
                                   if xref['primary_id'] != gene_id})

        node_dict = make_node(gene_id, description, gene_symbol,
                              update_date, other_synonyms)
        nodes.append(node_dict)
        gene_curie = node_dict['id']

        taxon_id_int = gene.get('taxon_id', None)
        assert taxon_id_int == 9606, "unexpected taxon ID"
        edges.append(kg2_util.make_edge(gene_curie,
                                        'NCBITaxon:' + str(taxon_id_int),
                                        'gene_found_in_organism',
                                        ENSEMBL_KB_IRI,
                                        update_date))

        hgnc_list = gene.get('HGNC', None)
        if hgnc_list is None:
            continue
        for hgnc_curie in hgnc_list:
            edges.append(kg2_util.make_edge(gene_curie,
                                            hgnc_curie,
                                            'xref',
                                            ENSEMBL_KB_IRI,
                                            update_date))
    return {'nodes': nodes, 'edges': edges}
def make_edge(subject_id: str, object_id: str, predicate_label: str,
              update_date: str = None, publications: list = None):
    """Create a ChEMBL edge with publication fields initialised.

    The relation CURIE is CURIE_PREFIX_CHEMBL_MECHANISM, a colon and the
    predicate label.  'publications' is set to the given list (or a new
    empty list when None) and 'publications_info' to an empty dict.
    """
    relation_curie = \
        kg2_util.CURIE_PREFIX_CHEMBL_MECHANISM + ':' + predicate_label
    edge = kg2_util.make_edge(subject_id,
                              object_id,
                              relation_curie,
                              predicate_label,
                              CHEMBL_KB_CURIE_ID,
                              update_date)
    if publications is None:
        edge['publications'] = []
    else:
        edge['publications'] = publications
    edge['publications_info'] = {}
    return edge
def make_kg2_graph(input_file_name: str, test_mode: bool = False):
    """Convert an Ensembl genes JSON file into KG2 nodes and edges.

    The node list starts with a node for the Ensembl knowledge source
    itself.  Each gene then contributes a node (built by the file's
    ``make_node`` helper), a biolink in-taxon edge, and an owl:sameAs edge
    for every entry of its optional 'HGNC' list.  The update date is the
    part of the 'genebuild' string after the first '/'.

    :param input_file_name: JSON file loaded with kg2_util.load_json
    :param test_mode: when true, only the first 10000 genes are processed
    :return: {'nodes': [...], 'edges': [...]}
    :raises AssertionError: if a gene's taxon_id is not 9606
    """
    ensembl_data = kg2_util.load_json(input_file_name)
    update_date = ensembl_data['genebuild'].split('/')[1]

    source_node = kg2_util.make_node(ENSEMBL_KB_CURIE_ID,
                                     ENSEMBL_KB_URI,
                                     'Ensembl Genes',
                                     kg2_util.BIOLINK_CATEGORY_DATA_FILE,
                                     update_date,
                                     ENSEMBL_KB_CURIE_ID)
    nodes = [source_node]
    edges = []

    for gene_number, gene in enumerate(ensembl_data['genes'], start=1):
        if test_mode and gene_number > 10000:
            break
        gene_id = gene['id']
        description = gene.get('description', None)
        gene_symbol = gene.get('name', None)

        # Synonyms are the distinct xref primary IDs other than the gene's
        # own Ensembl ID.
        xrefs = gene.get('xrefs', None)
        if xrefs is None:
            other_synonyms = []
        else:
            other_synonyms = list({xref['primary_id']
                                   for xref in xrefs
                                   if xref['primary_id'] != gene_id})

        node_dict = make_node(gene_id, description, gene_symbol,
                              update_date, other_synonyms)
        nodes.append(node_dict)
        gene_curie = node_dict['id']

        taxon_id_int = gene.get('taxon_id', None)
        assert taxon_id_int == 9606, "unexpected taxon ID"
        taxon_curie = \
            kg2_util.CURIE_PREFIX_NCBI_TAXON + ':' + str(taxon_id_int)
        edges.append(kg2_util.make_edge_biolink(
            gene_curie,
            taxon_curie,
            kg2_util.EDGE_LABEL_BIOLINK_IN_TAXON,
            ENSEMBL_KB_CURIE_ID,
            update_date))

        hgnc_list = gene.get('HGNC', None)
        if hgnc_list is None:
            continue
        for hgnc_curie in hgnc_list:
            edges.append(kg2_util.make_edge(gene_curie,
                                            hgnc_curie,
                                            kg2_util.CURIE_ID_OWL_SAME_AS,
                                            kg2_util.EDGE_LABEL_OWL_SAME_AS,
                                            ENSEMBL_KB_CURIE_ID,
                                            update_date))
    return {'nodes': nodes, 'edges': edges}
def format_edge(subject_id: str, object_id: str, predicate_label: str,
                description: str, publications: list = None):
    """Create a DrugBank edge (update date None) with optional extras.

    When ``description`` is given it is stored as
    ``{"sentence": description}`` under 'publications_info'; when
    ``publications`` is given it replaces the edge's 'publications'.
    """
    relation_curie = kg2_util.predicate_label_to_curie(
        predicate_label, DRUGBANK_RELATION_CURIE_PREFIX)
    edge = kg2_util.make_edge(subject_id,
                              object_id,
                              relation_curie,
                              predicate_label,
                              DRUGBANK_KB_CURIE_ID,
                              None)
    has_description = description is not None
    has_publications = publications is not None
    if has_description:
        edge["publications_info"] = {"sentence": description}
    if has_publications:
        edge["publications"] = publications
    return edge
def make_hmdb_edge(subject_id: str, object_id: str, subject_prefix: str,
                   object_prefix: str, predicate_label: str,
                   update_date: str, publications_info: dict):
    """Create an HMDB edge between two prefixed identifiers.

    The subject is always "<subject_prefix>:<subject_id>".  The object is
    "<object_prefix>:<object_id>" unless ``object_prefix`` is None, in
    which case ``object_id`` is used as given.  The biolink "same as"
    label goes through kg2_util.make_edge_biolink, everything else through
    kg2_util.make_edge; ``publications_info`` is attached either way.
    """
    relation_curie = kg2_util.predicate_label_to_curie(predicate_label,
                                                       CURIE_PREFIX_HMDB)
    subject_curie = subject_prefix + ":" + subject_id
    if object_prefix is None:
        object_curie = object_id
    else:
        object_curie = object_prefix + ":" + object_id

    if predicate_label != kg2_util.EDGE_LABEL_BIOLINK_SAME_AS:
        edge = kg2_util.make_edge(subject_curie,
                                  object_curie,
                                  relation_curie,
                                  predicate_label,
                                  HMDB_PROVIDED_BY_CURIE_ID,
                                  update_date)
    else:
        edge = kg2_util.make_edge_biolink(subject_curie,
                                          object_curie,
                                          predicate_label,
                                          HMDB_PROVIDED_BY_CURIE_ID,
                                          update_date)
    edge["publications_info"] = publications_info
    return edge
def make_kg2_graph(input_file_name: str, test_mode: bool = False):
    """Convert a repoDB CSV file into a dict of KG2 nodes and edges.

    The node list holds a single node describing the repoDB source, whose
    update date is the input file's modification time.  One edge is
    produced per CSV row, linking the row's drug to its indication with a
    relation named "clinically_tested_<status>_<phase>"; missing
    status/phase values become "unknown_status"/"unknown_phase".  When the
    row has an NCT identifier it is recorded in the edge's 'publications'
    list and 'publications_info' mapping.

    :param input_file_name: path of the CSV file, read with pandas
    :param test_mode: accepted for interface compatibility; not used here
    :return: {'nodes': [...], 'edges': [...]}
    """
    update_date = os.path.getmtime(input_file_name)
    nodes = [
        kg2_util.make_node(id=REPODB_CURIE + ':',
                           iri=REPODB_IRI,
                           name='repoDB drug repositioning database',
                           category_label=kg2_util.BIOLINK_CATEGORY_DATA_FILE,
                           update_date=update_date,
                           provided_by=REPODB_CURIE + ':')
    ]
    edges = []
    df = pd.read_csv(input_file_name)
    # Walk the needed columns in lockstep.  The previous implementation
    # recomputed df[col].isna() (a full-column scan) for every row, which
    # made the loop quadratic in the number of rows.
    rows = zip(df['status'], df['phase'], df['drug_id'], df['ind_id'],
               df['NCT'])
    for status_raw, phase_raw, drug_id, ind_id, nct_id in rows:
        if pd.isna(status_raw):
            status = "unknown_status"
        else:
            status = status_raw.lower()
        if pd.isna(phase_raw):
            phase = "unknown_phase"
        else:
            phase = phase_raw.lower().replace(" ", "_").replace("/", "_or_")
        relation = "clinically_tested_" + status + "_" + phase
        edge_dict = kg2_util.make_edge(
            subject_id=DRUGBANK_CURIE + ':' + drug_id,
            object_id=UMLS_CURIE + ':' + ind_id,
            relation_curie=REPODB_CURIE + ':' + relation,
            relation_label=relation,
            provided_by=REPODB_CURIE + ':',
            update_date=None)
        if not pd.isna(nct_id):
            trial_curie = NCT_CURIE + nct_id
            edge_dict['publications'].append(trial_curie)
            edge_dict['publications_info'][trial_curie] = \
                CLINICALTRIALS_IRI + nct_id
        edges.append(edge_dict)
    return {'nodes': nodes, 'edges': edges}
def make_rel(preds_dict: dict, subject_curie: str, object_curie: str,
             predicate: str, pmid: str, pub_date: str, sentence: str,
             subject_score: str, object_score: str, negated: bool):
    """Record one SemMedDB predication in ``preds_dict`` (mutated in place).

    Predications are keyed by "<subject>-<predicate>-<object>".  The first
    time a key is seen a new edge (with relation IRI and CURIE) is created
    and stored; on later calls only the publication CURIE and its info
    dict are added to the stored edge.

    Returns None.
    """
    key = '-'.join((subject_curie, predicate, object_curie))
    existing_edge = preds_dict.get(key, None)
    publication_curie = 'PMID:' + pmid
    publication_info_dict = {
        'publication date': pub_date,
        'sentence': sentence,
        'subject score': subject_score,
        'object score': object_score
    }

    if existing_edge is not None:
        existing_edge['publications info'][publication_curie] = \
            publication_info_dict
        # A new list is bound rather than appending in place.
        existing_edge['publications'] = \
            existing_edge['publications'] + [publication_curie]
        return

    relation_type = predicate.lower()
    if relation_type == 'xref':
        relation_curie = 'OBO:xref'
        relation_iri = prefixcommons.expand_uri(relation_curie)
    else:
        # lowerCamelCase fragment appended to the SemMedDB IRI
        camel = kg2_util.convert_snake_case_to_camel_case(
            relation_type.replace(' ', '_'))
        relation_iri = SEMMEDDB_IRI + '#' + camel[0].lower() + camel[1:]
        relation_curie = 'SEMMEDDB:' + relation_type
    new_edge = kg2_util.make_edge(subject_curie,
                                  object_curie,
                                  relation_iri,
                                  relation_curie,
                                  relation_type,
                                  SEMMEDDB_IRI,
                                  curr_timestamp)
    new_edge['publications'] = [publication_curie]
    new_edge['publications info'] = {
        publication_curie: publication_info_dict
    }
    new_edge['negated'] = negated
    preds_dict[key] = new_edge
def make_xref(subject: str, object: str, update_date: str):
    """Return an owl:sameAs ('equivalent_to') edge attributed to UNICHEM_KB_IRI."""
    return kg2_util.make_edge(subject,
                              object,
                              kg2_util.IRI_OWL_SAME_AS,
                              kg2_util.CURIE_OWL_SAME_AS,
                              'equivalent_to',
                              UNICHEM_KB_IRI,
                              update_date)
def make_edge(intact_row):
    """Turn one line of an IntAct tab-separated export into an edge.

    Comment lines (starting with '#') and interactions where either
    participant's taxon differs from HUMAN_TAXON yield None.  Otherwise an
    edge between the two interactors is returned, with 'uniprotkb' in
    their IDs replaced by the UniProt CURIE prefix and the pubmed
    publication identifiers attached.

    :param intact_row: one line with exactly 42 tab-separated columns
    :return: the edge dict, or None when the row is skipped
    :raises ValueError: if the row does not split into 42 columns
    """
    # The body previously read an undefined name ``row`` and so only worked
    # when a module-level variable of that name happened to exist.
    if intact_row.startswith('#'):
        return None
    data = intact_row.split('\t')
    [subject_id,       # ID(s) interactor A
     object_id,        # ID(s) interactor B
     _,                # Alt. ID(s) interactor A
     _,                # Alt. ID(s) interactor B
     subject_name,     # Alias(es) interactor A
     object_name,      # Alias(es) interactor B
     _,                # Interaction detection method(s)
     _,                # Publication 1st author(s)
     publications,     # Publication Identifier(s)
     subject_taxon,    # Taxid interactor A
     object_taxon,     # Taxid interactor B
     predicate,        # Interaction type(s)
     _,                # Source database(s)
     _,                # Interaction identifier(s)
     confidence,       # Confidence value(s)
     _,                # Expansion method(s)
     _,                # Biological role(s) interactor A
     _,                # Biological role(s) interactor B
     _,                # Experimental role(s) interactor A
     _,                # Experimental role(s) interactor B
     _,                # Type(s) interactor A
     _,                # Type(s) interactor B
     _,                # Xref(s) interactor A
     _,                # Xref(s) interactor B
     _,                # Interaction Xref(s)
     _,                # Annotation(s) interactor A
     _,                # Annotation(s) interactor B
     _,                # Interaction annotation(s)
     taxon,            # Host organism(s)
     _,                # Interaction parameter(s)
     created_date,     # Creation date
     update_date,      # Update date
     _,                # Checksum(s) interactor A
     _,                # Checksum(s) interactor B
     _,                # Interaction Checksum(s)
     _,                # Negative
     _,                # Feature(s) interactor A
     _,                # Feature(s) interactor B
     _,                # Stoichiometry(s) interactor A
     _,                # Stoichiometry(s) interactor B
     _,                # Identification method participant A
     _] = data         # Identification method participant B

    if subject_taxon != HUMAN_TAXON or object_taxon != HUMAN_TAXON:
        return None

    publications = [format_pmid(publication)
                    for publication in publications.split('|')
                    if publication.startswith('pubmed')]

    # Keep only the intact-miscore entries.  The filter used to test the
    # whole field instead of each score, so a miscore that was not the
    # first entry was dropped.  The score and the creation date are parsed
    # but not attached to the edge.
    miscores = [score.replace('intact-miscore:', '')
                for score in confidence.split('|')
                if score.startswith('intact-miscore:')]
    confidence = miscores[0] if miscores else None

    relation_label = format_rel_label(predicate)
    # The relation is the text between the first pair of double quotes.
    relation = predicate.split('"')[1]
    subject_id = subject_id.replace('uniprotkb',
                                    kg2_util.CURIE_PREFIX_UNIPROT)
    object_id = object_id.replace('uniprotkb',
                                  kg2_util.CURIE_PREFIX_UNIPROT)
    created_date = format_date(created_date)
    update_date = format_date(update_date)
    edge = kg2_util.make_edge(subject_id,
                              object_id,
                              relation,
                              relation_label,
                              INTACT_KB_CURIE_ID,
                              update_date)
    edge['publications'] = publications
    return edge
# Turn every non-comment annotation row into an edge, then write the graph.
for line in file_arr:
    if line[0].startswith("!"):
        # header/comment rows carry a leading '!'
        continue
    predicate_label = line[2]
    subject_curie = kg2_util.CURIE_PREFIX_UNIPROT + ":" + line[1]
    object_curie = line[3]
    publications = [line[4]]
    # The next three columns are read but not used for the edge.
    eco_code = line[5]
    source = line[6].split("|")
    update_date = line[8]
    evidence = line[10]
    # A "NOT|" qualifier marks a negated annotation; strip it from the
    # label and record it as a flag instead.
    negated = "NOT|" in predicate_label
    if negated:
        predicate_label = predicate_label.replace("NOT|", "")
    relation_curie = CURIE_PREFIX_GO + ":" + predicate_label
    edge = kg2_util.make_edge(subject_curie,
                              object_curie,
                              relation_curie,
                              predicate_label,
                              GO_PROVIDED_BY_CURIE_ID,
                              update_date)
    edge["negated"] = negated
    edge["publications"] = publications
    edges.append(edge)
graph = {"nodes": [], "edges": edges}
kg2_util.save_json(graph, args.outputFile, args.test)
print("Ending at", date())
def make_kg2_graph(input_file_name: str, test_mode: bool = False):
    """Convert an NCBI gene-info TSV file into KG2 nodes and edges.

    The node list starts with a node for the NCBI Genes source, dated with
    the input file's modification time.  Every non-comment line must have
    16 tab-separated columns; a column that is '-' after stripping is
    treated as missing.  Rows whose taxon is not
    kg2_util.NCBI_TAXON_ID_HUMAN are skipped.  Each remaining row yields a
    gene node (built by the file's ``make_node`` helper), an 'in_taxon'
    edge and one owl:sameAs edge per '|'-separated db_xref.

    :param input_file_name: path of the TSV file
    :param test_mode: when true, stop after 10000 non-comment lines
    :return: {'nodes': [...], 'edges': [...]}
    """
    file_mtime = os.path.getmtime(input_file_name)
    source_node = kg2_util.make_node(NCBI_KB_CURIE_ID,
                                     NCBI_KB_URL,
                                     'NCBI Genes',
                                     kg2_util.BIOLINK_CATEGORY_DATA_FILE,
                                     file_mtime,
                                     NCBI_KB_CURIE_ID)
    nodes = [source_node]
    edges = []
    rows_seen = 0
    with open(input_file_name, 'r') as input_file:
        for line in input_file:
            if line.startswith('#'):
                continue
            rows_seen += 1
            if test_mode and rows_seen > 10000:
                break
            raw_fields = line.rstrip("\n").split("\t")
            fields = [None if value.strip() == '-' else value
                      for value in raw_fields]
            (taxon_id_str, ncbi_gene_id, gene_symbol, locus_tag,
             synonyms_str, db_xrefs, chromosome, map_location, description,
             type_of_gene, symbol_auth, full_name_auth, nomenc_status,
             other_desig, modify_date, feature_type) = fields
            if int(taxon_id_str) != kg2_util.NCBI_TAXON_ID_HUMAN:
                # skip neanderthal- and denisovan-specific genes
                continue

            # Authoritative symbol first (when it differs from the gene
            # symbol), then synonyms, then other designations; the set
            # removes duplicates.
            synonym_pool = []
            if symbol_auth is not None and symbol_auth != gene_symbol:
                synonym_pool.append(symbol_auth)
            if synonyms_str is not None:
                synonym_pool.extend(synonyms_str.split('|'))
            if other_desig is not None:
                synonym_pool.extend(other_desig.split('|'))
            node_synonyms = list(set(synonym_pool))

            full_name = description if full_name_auth is None \
                else full_name_auth

            # An entry of unknown type whose xrefs start with MIM and that
            # has no nomenclature status is categorised as a genomic entity
            # rather than a gene.
            is_unnamed_locus = (type_of_gene == "unknown"
                                and db_xrefs is not None
                                and db_xrefs.startswith("MIM:")
                                and nomenc_status is None)
            if is_unnamed_locus:
                full_name = 'Genetic locus associated with ' + full_name
                category_label = kg2_util.BIOLINK_CATEGORY_GENOMIC_ENTITY
            else:
                category_label = kg2_util.BIOLINK_CATEGORY_GENE
            if full_name.startswith('microRNA'):
                category_label = kg2_util.BIOLINK_CATEGORY_MICRORNA

            node_dict = make_node(ncbi_gene_id, full_name, gene_symbol,
                                  modify_date, category_label,
                                  node_synonyms)
            node_curie_id = node_dict['id']

            # Description: "<description>; Type:..; Locus:..; NameStatus:.."
            # where the description and locus parts are optional.
            description_parts = []
            type_part = 'Type:' + type_of_gene
            if description is not None and description != full_name_auth:
                description_parts.append(description)
            description_parts.append(type_part)
            if map_location is not None:
                description_parts.append('Locus:' + map_location)
            nomenc_tag = 'unofficial' if nomenc_status is None else 'official'
            description_parts.append('NameStatus:' + nomenc_tag)
            node_dict['description'] = '; '.join(description_parts)
            nodes.append(node_dict)

            org_curie = kg2_util.CURIE_PREFIX_NCBI_TAXON + ':' + taxon_id_str
            edges.append(kg2_util.make_edge_biolink(node_curie_id,
                                                    org_curie,
                                                    'in_taxon',
                                                    NCBI_KB_CURIE_ID,
                                                    modify_date))

            if db_xrefs is None:
                continue
            for xref_curie in db_xrefs.split('|'):
                # Rewrite the source prefixes to the CURIE prefixes used
                # in the graph; unrecognised prefixes pass through as-is.
                if xref_curie.startswith('HGNC:HGNC:'):
                    xref_curie = kg2_util.CURIE_PREFIX_HGNC + ':' + \
                        xref_curie.replace('HGNC:', '')
                elif xref_curie.startswith('Ensembl:'):
                    xref_curie = xref_curie.upper()
                elif xref_curie.startswith('MIM:'):
                    xref_curie = kg2_util.CURIE_PREFIX_OMIM + ':' + \
                        xref_curie.replace('MIM:', '')
                elif xref_curie.startswith('miRBase:'):
                    xref_curie = kg2_util.CURIE_PREFIX_MIRBASE + ':' + \
                        xref_curie.replace('miRBase:', '')
                edges.append(kg2_util.make_edge(
                    node_curie_id,
                    xref_curie,
                    kg2_util.CURIE_ID_OWL_SAME_AS,
                    kg2_util.EDGE_LABEL_OWL_SAME_AS,
                    NCBI_KB_CURIE_ID,
                    modify_date))
    return {'nodes': nodes, 'edges': edges}
def get_rels_dict(nodes: dict,
                  owl_file_information_dict_list: list,
                  uri_to_curie_shortener: callable,
                  map_of_node_ontology_ids_to_curie_ids: dict):
    """Build the dict of edges (keyed by make_rel_key) for all ontologies.

    Two passes are made:

    1. For every ontology in ``owl_file_information_dict_list``, each edge
       of its graph is mapped to subject/object CURIEs and a predicate
       IRI/CURIE/label, and stored under its relation key unless that key
       already exists.  'UMLS:hasSTY' edges are not stored; instead the
       semantic type is appended to the subject node's description.
    2. For every node, an 'xref' edge is added to each of its xrefs that
       is itself a node (other than the node itself).

    :param nodes: node dicts keyed by CURIE; may be mutated (descriptions)
    :param owl_file_information_dict_list: dicts with 'ontology' and 'id'
    :param uri_to_curie_shortener: callable mapping an IRI to a CURIE or
        None
    :param map_of_node_ontology_ids_to_curie_ids: ontology-local ID ->
        node CURIE
    :return: dict mapping relation keys to edge dicts
    """
    rels_dict = dict()
    for owl_file_information_dict in owl_file_information_dict_list:
        ontology = owl_file_information_dict['ontology']
        ontology_id = owl_file_information_dict['id']
        ont_graph = ontology.get_graph()
        ontology_curie_id = map_of_node_ontology_ids_to_curie_ids[ontology_id]
        # Note the tuple order: each graph edge is unpacked as
        # (object, subject, data).
        for (object_id, subject_id, predicate_dict) in ont_graph.edges(data=True):
            assert type(predicate_dict) == dict
            # NOTE(review): ontology_update_date is only (re)assigned when
            # the ontology has a node; otherwise the value from a previous
            # ontology is reused, or the name is unbound if none was ever
            # set -- confirm this is intended.
            ontology_node = nodes.get(ontology_curie_id, None)
            if ontology_node is not None:
                ontology_update_date = ontology_node['update date']
            if subject_id == OWL_BASE_CLASS or object_id == OWL_BASE_CLASS:
                continue
            if subject_id.startswith(MYSTERIOUS_BASE_NODE_ID_TO_FILTER) or \
               object_id.startswith(MYSTERIOUS_BASE_NODE_ID_TO_FILTER):
                continue
            # subject_id and object_id are IDs from the original ontology
            # objects; these may not always be the node curie IDs (e.g.,
            # for SNOMED terms).  Need to map them.
            subject_curie_id = map_of_node_ontology_ids_to_curie_ids.get(
                subject_id, None)
            if subject_curie_id is None:
                kg2_util.log_message(
                    message="ontology node ID has no curie ID in the map",
                    ontology_name=ontology.id,
                    node_curie_id=subject_id,
                    output_stream=sys.stderr)
                continue
            object_curie_id = map_of_node_ontology_ids_to_curie_ids.get(
                object_id, None)
            if object_curie_id is None:
                kg2_util.log_message(
                    message="ontology node ID has no curie ID in the map",
                    ontology_name=ontology.id,
                    node_curie_id=object_id,
                    output_stream=sys.stderr)
                continue
            predicate_label = None
            edge_pred_string = predicate_dict['pred']
            # subClassOf edges between two TUI nodes are dropped.
            if subject_curie_id.startswith(
                    'TUI:') and object_curie_id.startswith(
                        'TUI:') and edge_pred_string == 'subClassOf':
                continue
            if not edge_pred_string.startswith(
                    'http:') and not edge_pred_string.startswith('https'):
                # edge_pred_string is not a URI; this is the most common case
                if ':' not in edge_pred_string:
                    # edge_pred_string is not a CURIE; this is the most common subcase
                    if edge_pred_string != 'subClassOf':
                        predicate_curie = 'owl:' + edge_pred_string
                    else:
                        predicate_curie = 'rdfs:subClassOf'
                    predicate_label = kg2_util.convert_camel_case_to_snake_case(
                        edge_pred_string)
                else:
                    # edge_pred_string is a CURIE
                    predicate_curie = edge_pred_string
                    predicate_node = nodes.get(predicate_curie, None)
                    if predicate_node is not None:
                        predicate_label = predicate_node['name']
                    else:
                        # predicate has no node object defined; just pull the label out of the CURIE
                        if edge_pred_string.startswith('OBO:'):
                            # Try the CURIE form "PREFIX:local" derived
                            # from "OBO:PREFIX_local".
                            test_curie = edge_pred_string.replace('OBO:', '').replace(
                                '_', ':')
                            predicate_node = nodes.get(test_curie, None)
                            if predicate_node is None:
                                predicate_label = edge_pred_string.split(
                                    ':')[1].split('#')[-1]
                            else:
                                # Label stays None here; it is filled in
                                # from the predicate node further below.
                                predicate_curie = test_curie
                        else:
                            predicate_label = edge_pred_string
                # Round-trip the CURIE through its IRI so that the
                # shortener's preferred CURIE (if any) is used.
                predicate_iri = prefixcommons.expand_uri(predicate_curie)
                predicate_curie_new = uri_to_curie_shortener(predicate_iri)
                if predicate_curie_new is not None:
                    predicate_curie = predicate_curie_new
            else:
                # edge_pred_string is already an IRI
                predicate_iri = edge_pred_string
                predicate_curie = uri_to_curie_shortener(predicate_iri)
            if predicate_curie is None:
                kg2_util.log_message(message="predicate IRI has no CURIE: " + predicate_iri,
                                     ontology_name=ontology.id,
                                     output_stream=sys.stderr)
                continue
            # Self-referential xrefs carry no information.
            if subject_curie_id == object_curie_id and predicate_label == 'xref':
                continue
            if predicate_curie == 'UMLS:hasSTY':
                # Fold the semantic type into the subject's description
                # instead of emitting an edge.
                subject_node = nodes[subject_curie_id]
                object_node = nodes[object_curie_id]
                subject_description = subject_node['description']
                if subject_description is None:
                    subject_description = ''
                subject_node['description'] = '; '.join(
                    list(
                        filter(None, [
                            subject_description,
                            'UMLS Semantic Type: ' + object_node['id']
                        ])))
                continue
            rel_key = make_rel_key(subject_curie_id, predicate_curie,
                                   object_curie_id, ontology_curie_id)
            if predicate_label is None and ':' in predicate_curie:
                pred_node = nodes.get(predicate_curie, None)
                if pred_node is not None:
                    predicate_label = pred_node['name']
                    # Lower-case a leading capital letter.
                    if predicate_label[0].isupper():
                        predicate_label = predicate_label[0].lower(
                        ) + predicate_label[1:]
            assert predicate_label is not None
            predicate_label = predicate_label.replace(' ', '_')
            # Only tested on Food and Efo ontologies
            predicate_label = kg2_util.convert_camel_case_to_snake_case(
                predicate_label)
            # The first edge seen for a given key wins.
            if rels_dict.get(rel_key, None) is None:
                edge = kg2_util.make_edge(subject_curie_id, object_curie_id,
                                          predicate_iri, predicate_curie,
                                          predicate_label, ontology_id,
                                          ontology_update_date)
                rels_dict[rel_key] = edge
    # Second pass: xref edges between nodes that are both in the graph.
    # NOTE(review): these edges reuse whatever ontology_update_date was
    # last assigned in the loop above -- confirm this is intended.
    for node_id, node_dict in nodes.items():
        xrefs = node_dict['xrefs']
        if xrefs is not None:
            for xref_node_id in xrefs:
                if xref_node_id in nodes and node_id != xref_node_id:
                    provided_by = nodes[node_id]['provided by']
                    key = make_rel_key(node_id, CURIE_OBO_XREF, xref_node_id,
                                       provided_by)
                    if rels_dict.get(key, None) is None:
                        edge = kg2_util.make_edge(node_id, xref_node_id,
                                                  IRI_OBO_XREF,
                                                  CURIE_OBO_XREF, 'xref',
                                                  provided_by,
                                                  ontology_update_date)
                        rels_dict[key] = edge
    return rels_dict
def make_xref(subject: str, object: str, update_date: str):
    """Return an owl:sameAs edge from ``subject`` to ``object`` attributed to UNICHEM_KB_CURIE."""
    return kg2_util.make_edge(subject,
                              object,
                              kg2_util.CURIE_ID_OWL_SAME_AS,
                              kg2_util.EDGE_LABEL_OWL_SAME_AS,
                              UNICHEM_KB_CURIE,
                              update_date)
def make_kg2_graph(input_file_name: str, test_mode: bool = False):
    """Convert a DGIdb interactions TSV file into KG2 nodes and edges.

    A line starting with '#' supplies the update date (the line with '#'
    removed); the 'gene_name' header line is ignored.  Every other line
    must have 11 tab-separated columns.  A row produces edges only when it
    has an Entrez gene ID and a usable drug identifier:

    * a drug concept ID containing "chembl" becomes a ChEMBL compound
      CURIE; any other non-empty concept ID causes the row to be skipped;
    * with no concept ID, a drug claimed by the GTPI or TTD sources gets a
      CURIE from its claim name and a new chemical-substance node.

    One edge per comma-separated interaction type is created (an empty
    type is replaced by "affects"), each carrying the row's PMIDs.

    :param input_file_name: path of the TSV file
    :param test_mode: when true, stop after 10000 data lines
    :return: {'nodes': [...], 'edges': [...]}
    """
    nodes = []
    edges = []
    line_ctr = 0
    update_date = None
    with open(input_file_name, 'r') as input_file:
        for line in input_file:
            line = line.rstrip("\n")
            if line.startswith('#'):
                update_date = line.replace('#', '')
                continue
            if line.startswith('gene_name\t'):
                continue
            line_ctr += 1
            if test_mode and line_ctr > 10000:
                break
            fields = line.split("\t")
            [gene_name,
             gene_claim_name,
             entrez_id,
             interaction_claim_source,
             interaction_types,
             drug_claim_name,
             drug_claim_primary_name,
             drug_name,
             drug_concept_id,
             _,  # 12.5.2020 new field in tsv: interaction group score
             PMIDs] = fields

            if entrez_id == "":
                # Without a gene ID there is no object for the edge; never
                # fall back on the gene of a previous row.
                continue
            object_curie_id = \
                kg2_util.CURIE_PREFIX_NCBI_GENE + ':' + entrez_id

            # Reset per row so that a row with neither a concept ID nor a
            # claim name cannot inherit the previous row's drug.
            subject_curie_id = None
            if drug_concept_id != "":
                if "chembl" in drug_concept_id:
                    _, chembl_id = drug_concept_id.split(":")
                    subject_curie_id = \
                        kg2_util.CURIE_PREFIX_CHEMBL_COMPOUND + ':' + chembl_id
                else:
                    print(f"DGIDB: Skipping row with drug concept id {drug_concept_id}", file=sys.stderr)
                    continue  # skipping over wikidata nodes, see #1185
            elif drug_claim_name != "":
                node_pubs_list = []
                if interaction_claim_source == INTERACTION_CLAIM_SOURCE_GTPI:
                    subject_curie_id = \
                        GTPI_CURIE_PREFIX + ':' + drug_claim_name
                    # When RE_PMID matches the primary name, group 1 is
                    # used as the node name and group 2 as its publication.
                    pmid_match = RE_PMID.match(drug_claim_primary_name)
                    if pmid_match is not None:
                        node_pubs_list = [
                            pmid_match[2].replace(' ', '').strip()
                        ]
                        node_name = pmid_match[1].strip()
                    else:
                        node_name = drug_claim_primary_name
                    node_iri = GTPI_BASE_URL + drug_claim_name
                    provided_by = GTPI_KB_CURIE
                elif interaction_claim_source == INTERACTION_CLAIM_SOURCE_TTD:
                    subject_curie_id = \
                        TTD_CURIE_PREFIX + ':' + drug_claim_name
                    node_name = drug_claim_primary_name
                    node_iri = TTD_IRI_BASE + drug_claim_name
                    provided_by = TTD_KB_CURIE
                if subject_curie_id is not None:
                    node_dict = kg2_util.make_node(
                        subject_curie_id,
                        node_iri,
                        node_name,
                        kg2_util.BIOLINK_CATEGORY_CHEMICAL_SUBSTANCE,
                        update_date,
                        provided_by)
                    node_dict['publications'] = node_pubs_list
                    nodes.append(node_dict)

            if subject_curie_id is None:
                print("DGIDB: no controlled ID was provided for this drug: " +
                      drug_claim_primary_name + "; source DB: " +
                      interaction_claim_source, file=sys.stderr)
                continue

            if interaction_types == "":
                print("DGIDB: interaction type was empty. Setting to 'affects'.", file=sys.stderr)
                interaction_types = "affects"

            pmids_list = []
            if PMIDs.strip() != "":
                pmids_list = [(kg2_util.CURIE_PREFIX_PMID + ':' + pmid.strip())
                              for pmid in PMIDs.split(',')]

            for interaction in interaction_types.split(','):
                interaction = interaction.replace(' ', '_')
                edge_dict = kg2_util.make_edge(
                    subject_curie_id,
                    object_curie_id,
                    DGIDB_CURIE_PREFIX + ':' + interaction,
                    interaction,
                    DGIDB_KB_CURIE,
                    update_date)
                edge_dict['publications'] = pmids_list
                edges.append(edge_dict)
    return {'nodes': nodes, 'edges': edges}
def make_kg2_graph(input_file_name: str, test_mode: bool = False):
    """Convert an NCBI gene-info TSV file into KG2 nodes and edges.

    Every non-comment line must have 16 tab-separated columns; a column
    that is '-' after stripping is treated as missing.  Rows whose taxon is
    not kg2_util.NCBI_TAXON_ID_HUMAN are skipped.  Each remaining row
    yields a gene node (built by the file's ``make_node`` helper), a
    'gene_found_in_organism' edge and one 'equivalent_to' edge per
    '|'-separated db_xref.

    :param input_file_name: path of the TSV file
    :param test_mode: when true, stop after 10000 non-comment lines
    :return: {'nodes': [...], 'edges': [...]}
    """
    nodes = []
    edges = []
    rows_seen = 0
    with open(input_file_name, 'r') as input_file:
        for line in input_file:
            if line.startswith('#'):
                continue
            rows_seen += 1
            if test_mode and rows_seen > 10000:
                break
            raw_fields = line.rstrip("\n").split("\t")
            fields = [None if value.strip() == '-' else value
                      for value in raw_fields]
            (taxon_id_str, ncbi_gene_id, gene_symbol, locus_tag,
             synonyms_str, db_xrefs, chromosome, map_location, description,
             type_of_gene, symbol_auth, full_name_auth, nomenc_status,
             other_desig, modify_date, feature_type) = fields
            if int(taxon_id_str) != kg2_util.NCBI_TAXON_ID_HUMAN:
                # skip neanderthal- and denisovan-specific genes
                continue

            # Authoritative symbol first (when it differs from the gene
            # symbol), then synonyms, then other designations; the set
            # removes duplicates.
            synonym_pool = []
            if symbol_auth is not None and symbol_auth != gene_symbol:
                synonym_pool.append(symbol_auth)
            if synonyms_str is not None:
                synonym_pool.extend(synonyms_str.split('|'))
            if other_desig is not None:
                synonym_pool.extend(other_desig.split('|'))
            node_synonyms = list(set(synonym_pool))

            full_name = description if full_name_auth is None \
                else full_name_auth
            node_dict = make_node(ncbi_gene_id, full_name, gene_symbol,
                                  modify_date, node_synonyms)
            node_curie_id = node_dict['id']

            # Description: "<description>; Type:..; Locus:..; NameStatus:.."
            # where the description and locus parts are optional.
            description_parts = []
            type_part = 'Type:' + type_of_gene
            if description is not None and description != full_name_auth:
                description_parts.append(description)
            description_parts.append(type_part)
            if map_location is not None:
                description_parts.append('Locus:' + map_location)
            nomenc_tag = 'unofficial' if nomenc_status is None else 'official'
            description_parts.append('NameStatus:' + nomenc_tag)
            node_dict['description'] = '; '.join(description_parts)
            nodes.append(node_dict)

            org_curie = kg2_util.CURIE_PREFIX_NCBI_TAXON + ':' + taxon_id_str
            predicate_label = 'gene_found_in_organism'
            relation, relation_curie = \
                kg2_util.predicate_label_to_iri_and_curie(
                    predicate_label,
                    NCBI_RELATION_CURIE_PREFIX,
                    NCBI_BASE_IRI)
            edges.append(kg2_util.make_edge(node_curie_id,
                                            org_curie,
                                            relation,
                                            relation_curie,
                                            predicate_label,
                                            NCBI_BASE_IRI,
                                            modify_date))

            if db_xrefs is None:
                continue
            for xref_curie in db_xrefs.split('|'):
                # Rewrite the source prefixes to the CURIE prefixes used
                # in the graph; unrecognised prefixes pass through as-is.
                if xref_curie.startswith('HGNC:HGNC:'):
                    xref_curie = 'HGNC:' + xref_curie.replace('HGNC:', '')
                elif xref_curie.startswith('Ensembl:'):
                    xref_curie = xref_curie.upper()
                elif xref_curie.startswith('MIM:'):
                    xref_curie = 'OMIM:' + xref_curie.replace('MIM:', '')
                edges.append(kg2_util.make_edge(node_curie_id,
                                                xref_curie,
                                                kg2_util.IRI_OWL_SAME_AS,
                                                kg2_util.CURIE_OWL_SAME_AS,
                                                'equivalent_to',
                                                NCBI_BASE_IRI,
                                                modify_date))
    return {'nodes': nodes, 'edges': edges}
def make_kg2_graph(input_file_name: str, test_mode: bool = False):
    """Convert a DGIdb interactions TSV file into KG2 nodes and edges.

    A line starting with '#' supplies the update date (the line with '#'
    removed); the 'gene_name' header line is ignored.  Every other line
    must have 10 tab-separated columns.  A row produces edges only when it
    has an Entrez gene ID and a usable drug identifier: either a ChEMBL ID,
    or a claim name from the "GuideToPharmacologyInteractions" or "TTD"
    sources (in which case a chemical-substance node is also created).

    One edge per comma-separated interaction type is created (an empty
    type is replaced by "affects"), each carrying the row's PMIDs.

    :param input_file_name: path of the TSV file
    :param test_mode: when true, stop after 10000 data lines
    :return: {'nodes': [...], 'edges': [...]}
    """
    nodes = []
    edges = []
    line_ctr = 0
    update_date = None
    with open(input_file_name, 'r') as input_file:
        for line in input_file:
            line = line.rstrip("\n")
            if line.startswith('#'):
                update_date = line.replace('#', '')
                continue
            if line.startswith('gene_name\t'):
                continue
            line_ctr += 1
            if test_mode and line_ctr > 10000:
                break
            fields = line.split("\t")
            [gene_name,
             gene_claim_name,
             entrez_id,
             interaction_claim_source,
             interaction_types,
             drug_claim_name,
             drug_claim_primary_name,
             drug_name,
             drug_chembl_id,
             PMIDs] = fields

            if entrez_id == "":
                # Without a gene ID there is no object for the edge; never
                # fall back on the gene of a previous row.
                continue
            object_curie_id = 'NCBIGene:' + entrez_id

            # Reset per row so that a row with neither a ChEMBL ID nor a
            # claim name cannot inherit the previous row's drug.
            subject_curie_id = None
            if drug_chembl_id != "":
                subject_curie_id = 'CHEMBL.COMPOUND:' + drug_chembl_id
            elif drug_claim_name != "":
                node_pubs_list = []
                if interaction_claim_source == "GuideToPharmacologyInteractions":
                    subject_curie_id = \
                        GTPI_CURIE_PREFIX + ':' + drug_claim_name
                    # When RE_PMID matches the primary name, group 1 is
                    # used as the node name and group 2 as its publication.
                    pmid_match = RE_PMID.match(drug_claim_primary_name)
                    if pmid_match is not None:
                        node_pubs_list = [
                            pmid_match[2].replace(' ', '').strip()
                        ]
                        node_name = pmid_match[1].strip()
                    else:
                        node_name = drug_claim_primary_name
                    node_iri = \
                        GTPI_IRI_BASE + GTPI_LIGAND_SUFFIX + drug_claim_name
                    provided_by = GTPI_IRI_BASE
                elif interaction_claim_source == "TTD":
                    subject_curie_id = \
                        TTD_CURIE_PREFIX + ':' + drug_claim_name
                    node_name = drug_claim_primary_name
                    node_iri = TTD_IRI_BASE + drug_claim_name
                    provided_by = TTD_IRI_BASE
                if subject_curie_id is not None:
                    node_dict = kg2_util.make_node(subject_curie_id,
                                                   node_iri,
                                                   node_name,
                                                   'chemical_substance',
                                                   update_date,
                                                   provided_by)
                    node_dict['publications'] = node_pubs_list
                    nodes.append(node_dict)

            if subject_curie_id is None:
                print("DGIDB: no controlled ID was provided for this drug: " +
                      drug_claim_primary_name + "; source DB: " +
                      interaction_claim_source, file=sys.stderr)
                continue

            if interaction_types == "":
                interaction_types = "affects"

            pmids_list = []
            if PMIDs.strip() != "":
                pmids_list = [('PMID:' + pmid.strip())
                              for pmid in PMIDs.split(',')]

            for interaction in interaction_types.split(','):
                interaction = interaction.replace(' ', '_')
                edge_dict = kg2_util.make_edge(
                    subject_curie_id,
                    object_curie_id,
                    DGIDB_BASE_IRI + '/' +
                    kg2_util.convert_snake_case_to_camel_case(interaction),
                    DGIDB_CURIE_PREFIX + ':' + interaction,
                    interaction,
                    DGIDB_BASE_IRI,
                    update_date)
                edge_dict['publications'] = pmids_list
                edges.append(edge_dict)
    return {'nodes': nodes, 'edges': edges}