def standardize_relationship(self,relationship):
    """Translate a relationship into its standard LabeledID.

    The relationship's ``identifier`` is looked up in
    ``self.relations_by_xref``. When the lookup yields ``None`` the
    placeholder ``GAMMA:0`` / ``Unmapped_Relation`` is returned; otherwise
    the result carries the mapped entry's ``identifier`` and ``name``.
    """
    mapped = self.relations_by_xref[relationship.identifier]
    if mapped is not None:
        return LabeledID(identifier=mapped.identifier, label=mapped.name)
    # Nothing is registered for this xref.
    return LabeledID(identifier="GAMMA:0", label="Unmapped_Relation")
def test_combined_gene_annotation(gene_annotator):
    """Annotate three genes and check Ensembl- and HGNC-derived properties.

    Each gene node is given its Ensembl synonym before ``annotate`` is
    called, and is then expected to carry properties from both sources.
    ``gene_annotator`` is supplied by the caller (presumably a pytest
    fixture).
    """
    # gene_annotator.annotate - these are coming from the cache after the first time
    ptgs1 = KNode('HGNC:9604', type=node_types.GENE)
    ptgs1.add_synonyms(
        {LabeledID(identifier='ENSEMBL:ENSG00000095303', label='PTGS1')})
    gene_annotator.annotate(ptgs1)
    # these are from ensembl
    assert ptgs1.properties['ensembl_name'] == 'PTGS1'
    assert ptgs1.properties['chromosome'] == '9'
    # these are from hgnc
    assert ptgs1.properties['location'] == '9q33.2'

    znf3 = KNode('HGNC:13089', type=node_types.GENE)
    znf3.add_synonyms(
        {LabeledID(identifier='ENSEMBL:ENSG00000166526', label='ZNF3')})
    gene_annotator.annotate(znf3)
    # these are from ensembl
    assert znf3.properties['ensembl_name'] == 'ZNF3'
    assert znf3.properties['chromosome'] == '7'
    # these are from hgnc
    assert 'Zinc fingers C2H2-type' in znf3.properties['gene_family']
    assert 28 in znf3.properties['gene_family_id']

    acp1 = KNode('HGNC:122', type=node_types.GENE)
    acp1.add_synonyms(
        {LabeledID(identifier='ENSEMBL:ENSG00000143727', label='ACP1')})
    gene_annotator.annotate(acp1)
    # these are from ensembl
    assert acp1.properties['ensembl_name'] == 'ACP1'
    assert acp1.properties['chromosome'] == '2'
    # these are from hgnc
    assert 1071 in acp1.properties['gene_family_id']
def __init__(self, context):
    """Set up service handles, predicates, BioMart URLs and SQL statements.

    Nothing is fetched and no database is opened here: the gene database
    flag starts ``False`` and the connection / annotation caches start as
    ``None``.

    :param context: provides ``core.clingen`` and ``cache`` (with ``redis``).
    """
    super(Ensembl, self).__init__("ensembl", context)

    # Collaborating services taken from the context.
    self.clingen = context.core.clingen
    self.cache = context.cache
    self.redis = context.cache.redis

    # Predicates attached to the edges this service produces.
    self.var_to_gene_predicate = LabeledID(identifier='GAMMA:0000102', label='nearby_variant_of')
    self.var_to_var_predicate = LabeledID(identifier='NCIT:C16798', label='linked_to')

    # Local sqlite gene database state.
    self.gene_db_successfully_created = False
    self.gene_db_path = os.path.join(os.path.dirname(__file__), 'genes.sqlite3')
    self.persistent_conn = None
    self.all_gene_annotations = None

    # we assume the order of attributes from this url -
    # if we change this we need to change the indexing in create_genes_db below
    self.ensembl_genes_url = """http://www.ensembl.org/biomart/martservice?query=<?xml version="1.0" encoding="UTF-8"?> <!DOCTYPE Query> <Query virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" > <Dataset name = "hsapiens_gene_ensembl" interface = "default" > <Attribute name = "ensembl_gene_id" /> <Attribute name = "gene_biotype" /> <Attribute name = "external_gene_name" /> <Attribute name = "start_position" /> <Attribute name = "end_position" /> <Attribute name = "description" /> <Attribute name = "chromosome_name" /> </Dataset> </Query>"""
    self.gene_batch_url = 'http://www.ensembl.org/biomart/martservice'

    # SQL: existence check, schema, indexes.
    self.check_if_already_done_sql = "SELECT name FROM sqlite_master WHERE type='table' AND name='genes';"
    self.genes_table_sql = """CREATE TABLE IF NOT EXISTS genes ( id INTEGER PRIMARY KEY AUTOINCREMENT, ensembl_id text, gene_name text, chromosome INTEGER, start_pos INTEGER, end_pos INTEGER, gene_type text, description text);"""
    self.genes_table_ensembl_id_index_sql = "CREATE UNIQUE INDEX ensembl_ids on genes(ensembl_id);"
    self.genes_table_composite_index_sql = "CREATE INDEX gene_composite on genes(chromosome, start_pos, end_pos, ensembl_id);"

    # SQL: insert and lookups. The embedded line break in the insert
    # statement is kept as an explicit escape so the text is unchanged.
    self.gene_entry_sql = "INSERT INTO genes (ensembl_id, gene_name, chromosome, start_pos, end_pos, \ngene_type, description) VALUES (?,?,?,?,?,?,?);"
    self.gene_range_select_sql = """SELECT ensembl_id, start_pos, end_pos FROM genes WHERE chromosome = ? AND ((? >= start_pos AND ? <= end_pos) OR (? >= start_pos AND ? <= end_pos) OR (? <= start_pos AND ? >= end_pos));"""
    self.gene_ensembl_id_select_sql = "SELECT * FROM genes WHERE ensembl_id = ?"
def normalize(self, node):
    """Choose the best identifier for ``node`` and prune its synonyms.

    'Best' is defined by the order in which identifier prefixes appear in
    the id prefix configuration of the node's concept
    (``self.concepts.get(node.type).id_prefixes``).

    Steps, all mutating ``node`` in place:
      1. When the same identifier occurs more than once among the synonyms
         and one copy has a ``None`` or empty label, that copy is removed.
      2. The first configured prefix that has synonyms supplies ``node.id``
         (preferring labelled candidates, then sort order) and, unless the
         label is the empty string, ``node.name``.
      3. Synonyms whose prefix is not configured for the type are removed.

    :param node: node exposing ``synonyms``, ``type``, ``id`` and ``name``.
    """
    # If we have two synonyms with the same id, but one has no label, chuck it
    labels_by_id = defaultdict(list)
    for synonym in node.synonyms:
        labels_by_id[synonym.identifier].append(synonym.label)
    for identifier, labels in labels_by_id.items():
        if len(labels) > 1:
            if None in labels:
                node.synonyms.remove(LabeledID(identifier=identifier, label=None))
            if '' in labels:
                node.synonyms.remove(LabeledID(identifier=identifier, label=''))

    # Now find the best one for an id
    type_curies = self.concepts.get(node.type).id_prefixes
    synonyms_by_curie = defaultdict(list)
    for synonym in node.synonyms:
        synonyms_by_curie[Text.get_curie(synonym.identifier)].append(synonym)

    for type_curie in type_curies:
        candidates = synonyms_by_curie[type_curie]
        if not candidates:
            continue
        if len(candidates) > 1:
            # Prefer candidates that carry a label, then take the first in
            # sort order so the choice is deterministic.
            labeled = [c for c in candidates if c.label is not None]
            if labeled:
                candidates = labeled
            candidates.sort()
        best = candidates[0]
        node.id = best.identifier
        # Only replace the label if we have a label.
        if best.label != '':
            node.name = best.label
        break

    # Remove any synonyms with extraneous prefixes. The point of this is not so much to remove
    # unknown prefixes, as to make sure that if we got e.g. a meddra, and we downcast it to a disease,
    # that we don't end up with HP's in the equivalent ids.
    bad_synonyms = set()
    for synonym in node.synonyms:
        if isinstance(synonym, LabeledID):
            prefix = Text.get_curie(synonym.identifier)
        else:
            prefix = Text.get_curie(synonym)
        if prefix not in type_curies:
            bad_synonyms.add(synonym)
    for bad in bad_synonyms:
        node.synonyms.remove(bad)

    if node.id.startswith('DOID'):
        # Logger.warn is a deprecated alias; use warning().
        logger.warning("We are ending up with a DOID here")
        logger.warning(node.id)
        logger.warning(node.synonyms)
        logger.warning(node.type)
def standardize_predicate(self, predicate, sourcenode=None, targetnode=None):
    """Standardize a CTD predicate, collapsing compound '|'-separated labels.

    A label without '|' goes straight to the concept model. Otherwise the
    parts mentioning 'reaction' or 'cotreatment' are discarded; unless
    exactly one part remains, the generic ``CTD:interacts_with`` predicate
    is standardized instead. ``sourcenode`` and ``targetnode`` are accepted
    but not used here.
    """
    label = predicate.label
    if '|' not in label:
        return self.concept_model.standardize_relationship(predicate)

    kept = [
        part for part in label.split('|')
        if not ('reaction' in part or 'cotreatment' in part)
    ]
    if len(kept) != 1:
        generic = LabeledID(identifier='CTD:interacts_with', label='interacts_with')
        return self.concept_model.standardize_relationship(generic)

    #Change the modifier to "affects" to deal with the fact that we don't know what the deleted part does.
    parent = self.term_parents[kept[0].split('^')[1]]
    affects_id = f'CTD:affects^{parent}'
    return self.normalize_predicate(LabeledID(identifier=affects_id, label=affects_id))
def get_drug_from_adverse_events(self, input_node):
    """Find chemicals with a high or low rate of causing the node concept.

    For every MEDDRA labelled id on ``input_node`` the aeolus outcomes are
    queried by outcome name. An outcome is kept only when its name matches
    the MEDDRA label and it has more than 5 cases; it then yields
    ``causes_or_contributes_to`` when the whole 95% PRR interval lies above
    1, ``prevents`` when it lies below 1, and is skipped otherwise.

    :param input_node: node (drug or phenotype) exposing
        ``get_labeled_ids_by_prefix``.
    :return: list of ``(edge, drug_node)`` tuples.
    """
    return_results = []
    for meddra in input_node.get_labeled_ids_by_prefix('MEDDRA'):
        mname = meddra.label
        murl = f'{self.url}query?q=aeolus.outcomes.name:{mname}'
        for hit in self.page_calls(murl, 100):
            if 'aeolus' not in hit:
                continue
            for outcome in hit['aeolus']['outcomes']:
                #I think it makes sense to do some filtering here. I don't want anything unless the
                # CI bound excludes 1, and if I have enough counts (more than 5)
                if outcome['name'] != mname:
                    continue
                if outcome['case_count'] <= 5:
                    continue
                interval = outcome['prr_95_ci']
                if min(interval) > 1:
                    predicate = LabeledID(identifier="RO:0003302",
                                          label="causes_or_contributes_to")
                elif max(interval) < 1:
                    predicate = LabeledID(identifier="RO:0002599", label="prevents")
                else:
                    continue
                drug_node = self.make_drug_node(hit)
                if drug_node is None:
                    continue
                props = {
                    'prr': outcome['prr'],
                    'ror': outcome['ror'],
                    'case_count': outcome['case_count']
                }
                edge = self.create_edge(drug_node, input_node,
                                        'mychem.get_adverse_events', mname,
                                        predicate, url=murl, properties=props)
                return_results.append((edge, drug_node))
    return return_results
def anatomy_to_gene (self, anat):
    """Return genes linked to an anatomy node in the hetio graph.

    The first UBERON synonym of ``anat`` is used for the query. A node with
    no UBERON synonym yields an empty list instead of raising IndexError.

    :param anat: anatomy node exposing ``get_synonyms_by_prefix``.
    :return: list of ``(edge, gene_node)`` tuples.
    """
    anat_identifiers = list(anat.get_synonyms_by_prefix('UBERON'))
    if not anat_identifiers:
        # Nothing to query with.
        return []
    anat_identifier = anat_identifiers[0]
    nodes,edges = self.query (
        "MATCH (a:Anatomy)-[ar]-(g:Gene) WHERE a.identifier='{0}' RETURN a, ar, g ".format (anat_identifier),
        labels=['Gene'],
        kinds=['node','relationship'])
    node_ids = [ LabeledID(identifier=f"NCBIGENE:{node['identifier']}", label=node['name']) for node in nodes ]
    edge_ids = [ edge['type'] for edge in edges ]
    results = []
    # Nodes and relationships are paired positionally.
    for node_id, predicate_label in zip(node_ids,edge_ids):
        predicate = LabeledID(identifier=f'hetio:{predicate_label}', label=predicate_label)
        gene = KNode(node_id.identifier, type=node_types.GENE, name=node_id.label)
        #These edges all go from anatomy to gene
        edge = self.create_edge(anat, gene,'hetio.anatomy_to_gene',anat_identifier,predicate)
        results.append((edge, gene))
    return results
def gene_to_anatomy (self, gene):
    """Return anatomy nodes linked to a gene in the hetio graph.

    The first NCBIGENE synonym of ``gene`` (with its prefix removed) is used
    for the query, which is limited to 200 rows. A node with no NCBIGENE
    synonym yields an empty list instead of raising IndexError.

    :param gene: gene node exposing ``get_synonyms_by_prefix``.
    :return: list of ``(edge, anatomy_node)`` tuples.
    """
    gene_identifiers = list(gene.get_synonyms_by_prefix('NCBIGENE'))
    if not gene_identifiers:
        # Nothing to query with.
        return []
    gene_identifier = Text.un_curie(gene_identifiers[0])
    nodes,edges = self.query (
        "MATCH (a:Anatomy)-[ar]-(g:Gene) WHERE g.identifier={0} RETURN a, ar, g LIMIT 200".format (gene_identifier),
        labels=['Anatomy'],
        kinds=['node','relationship'])
    node_ids = [ LabeledID(identifier=node['identifier'], label=node['name']) for node in nodes ]
    edge_ids = [ edge['type'] for edge in edges ]
    results = []
    # Nodes and relationships are paired positionally.
    for node_id, predicate_label in zip(node_ids,edge_ids):
        predicate = LabeledID(identifier=f'hetio:{predicate_label}', label=predicate_label)
        anatomy = KNode(node_id.identifier, type=node_types.ANATOMY, name=node_id.label)
        #These edges all go from anatomy to gene
        edge = self.create_edge(anatomy, gene,'hetio.gene_to_anatomy',gene_identifier,predicate)
        results.append((edge, anatomy))
    return results
def drug_get_gene(self, subject):
    """Get genes targeted by a drug, via the Pharos ligand API.

    Each CHEMBL.COMPOUND synonym of ``subject`` is looked up; every linked
    target that resolves to an HGNC id becomes a gene node with an edge
    from the drug.

    :param subject: drug node exposing ``get_synonyms_by_prefix``.
    :return: list of ``(edge, gene_node)`` tuples.
    """
    resolved_edge_nodes = []
    identifiers = subject.get_synonyms_by_prefix('CHEMBL.COMPOUND')
    for s in identifiers:
        pharosid = Text.un_curie(s)
        url = 'https://pharos.nih.gov/idg/api/v1/ligands(%s)?view=full' % pharosid
        r = requests.get(url)
        try:
            result = r.json()
        except ValueError:
            #Pharos returns a 404 if it doesn't recognize the identifier, which ends up producing
            # errors in turning into json. Skip to next identifier
            continue
        predicate = LabeledID(identifier='PHAROS:drug_targets', label='is_target')
        for link in result['links']:
            if link['kind'] != 'ix.idg.models.Target':
                continue
            pharos_target_id = int(link['refid'])
            hgnc = self.target_to_hgnc(pharos_target_id)
            if hgnc is not None:
                hgnc_node = KNode(hgnc, type=node_types.GENE)
                edge = self.create_edge(subject,hgnc_node,'pharos.drug_get_gene',pharosid,predicate,url=url)
                resolved_edge_nodes.append((edge, hgnc_node))
            else:
                logging.getLogger('application').warning('Did not get HGNC for pharosID %d' % pharos_target_id)
    return resolved_edge_nodes
def gene_get_drug(self, gene_node):
    """Get drugs that target a gene, via the Pharos target API.

    Each UNIPROTKB synonym of ``gene_node`` is looked up; every linked
    ligand that resolves to a ChEMBL id becomes a chemical substance node
    with an edge to the gene. The lookup is best-effort: a failure for one
    identifier is logged at debug level and the next one is tried.

    :param gene_node: gene node exposing ``get_synonyms_by_prefix``.
    :return: list of ``(edge, drug_node)`` tuples.
    """
    resolved_edge_nodes = []
    identifiers = gene_node.get_synonyms_by_prefix('UNIPROTKB')
    for s in identifiers:
        try:
            logger.debug(f'Call with {s}')
            pharosid = Text.un_curie(s)
            url = 'https://pharos.nih.gov/idg/api/v1/targets(%s)?view=full' % pharosid
            r = requests.get(url)
            try:
                result = r.json()
            except ValueError:
                #If pharos doesn't know the identifier, it just 404s.  move to the next
                logger.debug('404')
                continue
            logger.debug('back')
            predicate = LabeledID(identifier='PHAROS:drug_targets', label='is_target')
            for link in result['links']:
                if link['kind'] != 'ix.idg.models.Ligand':
                    continue
                chembl_id, label = self.drugid_to_identifiers(link['refid'])
                if chembl_id is not None:
                    drug_node = KNode(chembl_id, type=node_types.CHEMICAL_SUBSTANCE, name=label)
                    edge = self.create_edge(drug_node,gene_node, 'pharos.gene_get_drug', pharosid,predicate, url=url)
                    resolved_edge_nodes.append( (edge,drug_node) )
        except Exception:
            # Deliberately best-effort; the identifier is passed as a lazy
            # %-argument so the log record formats correctly.
            logger.debug("Error encountered calling pharos with %s", s, exc_info=True)
    logger.debug('ok')
    return resolved_edge_nodes
def graph_drugbank_to_uniprot(self, drugbank):
    """Return gene nodes associated with a DrugBank drug in the triplestore.

    :param drugbank: drug node; its un-curied identifier is prefixed with
        ``DB`` to form the DrugBank id used in the query.
    :return: list of ``(edge, gene_node)`` tuples, one per result row.
    """
    # NOTE(review): reads drugbank.identifier here but drugbank.id below;
    # confirm both attributes exist on the node type.
    response = self.triplestore.query_template(
        inputs={
            "drugID": "DB{0}".format(Text.un_curie(drugbank.identifier))
        },
        outputs=["uniprotGeneID"],
        template_text=""" prefix drugbank: <http://chem2bio2rdf.org/drugbank/resource/> prefix drugbank_drug: <http://chem2bio2rdf.org/drugbank/resource/drugbank_drug/> prefix ctd: <http://chem2bio2rdf.org/ctd/resource/> select distinct ?uniprotGeneID where { values ( ?drugID ) { ( drugbank_drug:${drugID} ) } ?dbInter drugbank:GeneBank_ID ?geneBankID ; drugbank:gene ?uniprotGeneID . ?drugID drugbank:CID ?pubchemCID ; drugbank:Generic_Name ?drugGenericName . ?ctd_disease ctd:diseaseid ?diseaseID ; ctd:cid ?pubchemCID . }""")
    predicate = LabeledID(identifier='SIO:001257', label='chemical to gene association')
    results = []
    for r in response:
        # The gene id is the last path segment of the returned IRI.
        node = KNode("UNIPROT:{0}".format(r['uniprotGeneID'].split('/')[-1]),
                     type=node_types.GENE)
        # Argument order is (source, target, provided_by, input_id, predicate).
        edge = self.create_edge(drugbank, node,
                                'chembio.graph_drugbank_to_uniprot',
                                drugbank.id, predicate)
        results.append((edge, node))
    return results
def get_edges_from_file(self, file_name, provided_by, delimiter):
    """Yield one edge per row of a delimited edge file.

    Stop-gap until kgx can merge edges: rows are shaped like the output of
    a robokop service and the writer is left to handle them. Each row must
    provide ``edge_label``, ``relation``, ``subject`` and ``object``
    columns. A falsy ``file_name`` yields nothing.

    :param file_name: path of the file to read.
    :param provided_by: provenance string passed to ``create_edge``.
    :param delimiter: column delimiter handed to ``csv.DictReader``.
    :return: generator of edges whose ``standard_predicate`` is set to the
        predicate built from the row.
    """
    if file_name:
        # Constructed but currently unused: the curie resolution it was
        # meant for is disabled below.
        bl_resolver = BL_lookup()
        with open(file_name) as edge_file:
            for raw_edge in csv.DictReader(edge_file, delimiter=delimiter):
                # Keep only the part of the label after the last ':'.
                edge_label = raw_edge['edge_label'].split(':')[-1]
                predicate = LabeledID(
                    identifier=raw_edge['relation'],  #bl_resolver.resolve_curie(edge_label),
                    label=edge_label)
                source_node = KNode(raw_edge['subject'])
                target_node = KNode(raw_edge['object'])
                edge = self.create_edge(
                    source_node=source_node,
                    target_node=target_node,
                    input_id=source_node.id,
                    provided_by=provided_by,
                    predicate=predicate,
                )
                edge.standard_predicate = predicate
                yield edge
def disease_get_gene(self, disease_node):
    """Get genes associated with a disease from the Pharos database.

    Every DOID, UMLS, MESH, OMIM and ORPHANET synonym of ``disease_node`` is
    rewritten into the form(s) Pharos stores and looked up in the disease
    table. Each HGNC id is reported at most once.

    :param disease_node: disease node exposing ``get_synonyms_by_prefix``.
    :return: list of ``(edge, gene_node)`` tuples; edges run gene -> disease.
    """
    resolved_edge_nodes = []
    hgncs = set()
    # WD:P2293 gene assoc with condition.
    # domain is gene and range is disease or phenotype for this relationship
    predicate = LabeledID(identifier='WD:P2293', label='gene_involved')
    #Pharos contains multiple kinds of disease identifiers in its disease table:
    # For OMIM identifiers, they can have either prefix OMIM or MIM
    # UMLS doesn't have any prefixes.... :(
    pharos_predicates = {'DOID':('DOID',),'UMLS':(None,),'MESH':('MESH',),'OMIM':('OMIM','MIM'),'ORPHANET':('Orphanet',)}
    # The identifier is bound as a parameter, not spliced into the SQL text,
    # so ids containing quotes cannot break or alter the statement.
    query = ("select distinct x.value, p.sym from disease d "
             "join xref x on x.protein_id = d.target_id "
             "join protein p on d.target_id = p.id "
             "where x.xtype = 'HGNC' and d.dtype <> 'Expression Atlas' and d.did = %s;")
    for ppred,dbpreds in pharos_predicates.items():
        pharos_candidates = [Text.un_curie(x) for x in disease_node.get_synonyms_by_prefix(ppred)]
        for dbpred in dbpreds:
            if dbpred is None:
                pharos_ids = pharos_candidates
            else:
                pharos_ids = [f'{dbpred}:{x}' for x in pharos_candidates]
            for pharos_id in pharos_ids:
                cursor = self.db.cursor(dictionary = True, buffered = True)
                try:
                    cursor.execute(query, (pharos_id,))
                    for result in cursor:
                        label = result['sym']
                        hgnc = result['value']
                        if hgnc in hgncs:
                            continue
                        hgncs.add(hgnc)
                        gene_node = KNode(hgnc, type=node_types.GENE, name=label)
                        edge = self.create_edge(gene_node, disease_node, 'pharos.disease_get_gene', pharos_id, predicate)
                        resolved_edge_nodes.append((edge, gene_node))
                finally:
                    # One cursor is opened per lookup; release it each time.
                    cursor.close()
    return resolved_edge_nodes
def disease_to_phenotype (self, disease):
    """Return symptom (phenotype) nodes linked to a disease in hetio.

    The first DOID synonym of ``disease`` is used for the query; a node
    without DOID synonyms yields an empty list.

    :param disease: disease node exposing ``get_synonyms_by_prefix``.
    :return: list of ``(edge, phenotype_node)`` tuples.
    """
    doids = list(disease.get_synonyms_by_prefix('DOID'))
    if not doids:
        return []
    doid = doids[0]
    cypher = """MATCH (d:Disease{identifier:'%s'})-[r]-(s:Symptom) RETURN d,r,s""" % (doid)
    nodes,edges = self.query (cypher, labels=['Symptom'], kinds=['node','relationship'])
    symptom_ids = [
        LabeledID(identifier=f"MESH:{node['identifier']}", label=node['name'])
        for node in nodes
    ]
    relation_types = [ edge['type'] for edge in edges ]
    results = []
    # Symptoms and relationships are paired positionally.
    for symptom_id, relation_type in zip(symptom_ids, relation_types):
        predicate = LabeledID(identifier=f'hetio:{relation_type}', label=relation_type)
        phenotype = KNode(symptom_id.identifier, type=node_types.PHENOTYPE, name=symptom_id.label)
        edge = self.create_edge(disease, phenotype, 'hetio.disease_to_phenotype', doid, predicate)
        results.append( (edge, phenotype) )
    return results
def term_get_ancestors(self, node_type, root_iri):
    """Collect parent/child term pairs under ``root_iri`` as nodes and edges.

    Runs ``self.query`` against the triplestore and turns every row into a
    child -> ancestor ``rdfs:subClassOf`` edge. Rows whose parent and child
    resolve to the same id are skipped. The row count is printed.

    :param node_type: type given to every node created.
    :param root_iri: value bound to ``root_uri`` in the query template.
    :return: ``(nodes, edges)``, both sets.
    """
    rows = self.triplestore.query_template(
        template_text=self.query,
        inputs={'root_uri': root_iri},
        outputs=['parent_id', 'parent_label', 'child_id', 'child_label'])
    print('found total ', len(rows), ' results.')
    nodes, edges = set(), set()
    for row in rows:
        # Output type would be same as input type?
        ancestor_node = KNode(Text.obo_to_curie(row['parent_id']),
                              name=row['parent_label'], type=node_type)
        child_node = KNode(Text.obo_to_curie(row['child_id']),
                           name=row['child_label'], type=node_type)
        if ancestor_node.id != child_node.id:
            predicate = LabeledID(identifier='rdfs:subClassOf', label='subclass of')
            edge = self.create_edge(
                source_node=child_node,
                target_node=ancestor_node,
                predicate=predicate,
                provided_by='uberongraph.term_get_ancestors',
                input_id=child_node.id)
            nodes.add(child_node)
            nodes.add(ancestor_node)
            edges.add(edge)
        # else: refrain from adding edge to the node itself
    return nodes, edges
def gene_to_drug_expanded(self, gene_node):
    """Return chemicals interacting with a gene (CTD expanded table).

    Each NCBIGENE synonym is queried; rows rejected by
    ``check_expanded_gene_chemical_row`` are skipped. The row's
    ``direction`` decides which node is the edge subject ('->' means the
    chemical). Per identifier, only the first edge for each
    (chemical id, original predicate label) pair is kept.

    :param gene_node: gene node exposing ``get_synonyms_by_prefix``.
    :return: list of ``(edge, drug_node)`` tuples.
    """
    output = []
    for identifier in gene_node.get_synonyms_by_prefix('NCBIGENE'):
        seen = set()
        geneid = Text.un_curie(identifier)
        url = f"{self.url}CTD_chem_gene_expanded_geneID/ncbigene:{geneid}/"
        rows = requests.get (url).json ()
        for r in rows:
            good_row, predicate_label, props, pmids = self.check_expanded_gene_chemical_row(r)
            if not good_row:
                continue
            predicate = self.normalize_predicate(
                LabeledID(identifier=f"CTD:{Text.snakify(predicate_label)}", label=predicate_label)
            )
            #Should this be substance?
            drug_node = KNode(Text.upper_curie(r['chemicalID']),
                              type=node_types.CHEMICAL_SUBSTANCE,
                              name=r['chem_label'])
            if r['direction'] == '->':
                source, target = drug_node, gene_node
            else:
                source, target = gene_node, drug_node
            edge = self.create_edge(source,target,'ctd.gene_to_drug_expanded',identifier,predicate,properties = props,url=url,publications=pmids)
            #This is what we'd like it to be, but right now there's not enough real specificity on the predicates
            #key = (drug_node.id, edge.standard_predicate.label)
            key = (drug_node.id, edge.original_predicate.label)
            if key in seen:
                continue
            output.append( (edge,drug_node) )
            seen.add(key)
    return output
def disease_to_exposure(self, disease_node):
    """Return exposure stressors related to a disease (CTD exposure events).

    Each MESH synonym is queried; rows whose outcome relationship is
    'no correlation' are skipped. Per identifier, only the first edge for
    each (chemical id, standard predicate) pair is kept.

    :param disease_node: disease node exposing ``get_synonyms_by_prefix``.
    :return: list of ``(edge, drug_node)`` tuples; edges run chemical -> disease.
    """
    logger.info("disease-to-exposure")
    output = []
    for identifier in disease_node.get_synonyms_by_prefix('MESH'):
        seen = set()
        url = f"{self.url}CTD_exposure_events_diseaseid/{Text.un_curie(identifier)}/"
        rows = requests.get (url).json ()
        logger.info(url)
        logger.info(len(rows))
        for r in rows:
            predicate_label = r['outcomerelationship']
            if predicate_label == 'no correlation':
                continue
            # The predicate id is the label with all whitespace removed.
            compact_label = ''.join(predicate_label.split())
            predicate = self.normalize_predicate(
                LabeledID(identifier=f"CTD:{compact_label}", label=predicate_label)
            )
            #Should this be substance?
            drug_node = KNode(f"MESH:{r['exposurestressorid']}",
                              type=node_types.CHEMICAL_SUBSTANCE,
                              name=r['exposurestressorname'])
            edge = self.create_edge(drug_node,disease_node,'ctd.disease_to_exposure',identifier,predicate,
                                    publications=[f"PMID:{r['reference']}"],url=url)
            key = (drug_node.id, edge.standard_predicate)
            if key in seen:
                continue
            output.append( (edge,drug_node) )
            seen.add(key)
    return output
def gene_to_drug(self, gene_node):
    """Return chemicals interacting with a gene (CTD chem-gene interactions).

    Each NCBIGENE synonym is queried; rows for a different ``GeneID`` or
    rejected by ``check_gene_chemical_row`` are skipped. The gene is the
    edge subject when the normalized predicate's identifier contains any of
    ``self.g2d_strings``, otherwise the chemical is. Per identifier, only
    the first edge for each (chemical id, original predicate label) pair is
    kept.

    :param gene_node: gene node exposing ``get_synonyms_by_prefix``.
    :return: list of ``(edge, drug_node)`` tuples.
    """
    output = []
    for identifier in gene_node.get_synonyms_by_prefix('NCBIGENE'):
        seen = set()
        geneid = Text.un_curie(identifier)
        url = f"{self.url}/CTD_chem_gene_ixns_GeneID/{geneid}/"
        rows = requests.get (url).json ()
        for r in rows:
            if r['GeneID'] != geneid:
                continue
            good_row, predicate_label, props = self.check_gene_chemical_row(r)
            if not good_row:
                continue
            predicate = self.normalize_predicate(
                LabeledID(identifier=f'CTD:{predicate_label}', label=predicate_label)
            )
            #Should this be substance?
            drug_node = KNode(f"MESH:{r['ChemicalID']}",
                              type=node_types.CHEMICAL_SUBSTANCE,
                              name=f"{r['ChemicalName']}")
            gene_is_subject = any(s in predicate.identifier for s in self.g2d_strings)
            if gene_is_subject:
                source, target = gene_node, drug_node
            else:
                source, target = drug_node, gene_node
            pmids = [f"PMID:{x}" for x in r['PubMedIDs'].split('|') ]
            edge = self.create_edge(source,target,'ctd.gene_to_drug',identifier,predicate,
                                    publications=pmids,url=url,properties=props)
            #This is what we'd like it to be, but right now there's not enough real specificity on the predicates
            #key = (drug_node.id, edge.standard_predicate.label)
            key = (drug_node.id, edge.original_predicate.label)
            if key in seen:
                continue
            output.append( (edge,drug_node) )
            seen.add(key)
    return output
def drug_to_gene_expanded(self, drug):
    """Return genes interacting with a chemical (CTD expanded table).

    Each MESH synonym is queried; rows rejected by
    ``check_expanded_gene_chemical_row`` are skipped. The row's
    ``direction`` decides which node is the edge subject ('->' means the
    chemical). No de-duplication is applied.

    :param drug: chemical node exposing ``get_synonyms_by_prefix``.
    :return: list of ``(edge, gene_node)`` tuples.
    """
    output = []
    for identifier in drug.get_synonyms_by_prefix('MESH'):
        url=f"{self.url}CTD_chem_gene_expanded_chemicalID/mesh:{Text.un_curie(identifier)}/"
        rows = requests.get(url).json()
        for r in rows:
            good_row, predicate_label, props, pmids = self.check_expanded_gene_chemical_row(r)
            if not good_row:
                continue
            predicate = self.normalize_predicate(
                LabeledID(identifier=f"CTD:{Text.snakify(predicate_label)}", label=predicate_label)
            )
            gene_node = KNode(Text.upper_curie(r['geneID']), name=r['gene_label'],type=node_types.GENE)
            if r['direction'] == '->':
                source, target = drug, gene_node
            else:
                source, target = gene_node, drug
            edge = self.create_edge(source,target,'ctd.drug_to_gene_expanded',identifier,predicate,publications=pmids,properties=props,url=url )
            output.append( (edge,gene_node) )
    return output
def get_pathway_by_gene_family(self, gene_family_node):
    """Create pathway nodes for the pathways listed on a gene family's rows.

    Each non-empty entry of a row's ``pathway`` field is split on '>' into
    a pathway part and a component part; the pathway part is split on '#'
    into name and accession. Only the pathway part is used.

    :param gene_family_node: gene family node; its ``id`` selects the rows.
    :return: list of ``(edge, pathway_node)`` tuples.
    """
    results = []
    predicate = LabeledID('BFO:0000054', 'related_to')
    for gene_family_data in self.get_rows_using_curie(gene_family_node.id):
        entries = [x for x in gene_family_data['pathway'] if x != '']
        #parse out the data
        for entry in entries:
            pathway_part, _component_part = self.split_with(entry, splitter='>')
            pathway = self.split_with(
                pathway_part,
                splitter='#',
                keys=['pathway_name', 'pathway_access'])
            # component_data = self.split_with(component_data_raw, splitter= '#', ['component_name', 'component_access'])
            pathway_node = KNode(
                f"PANTHER.PATHWAY:{pathway['pathway_access']}",
                type=node_types.PATHWAY,
                name=pathway['pathway_name'])
            edge = self.create_edge(gene_family_node, pathway_node,
                                    'panther.get_pathway_by_gene_family',
                                    gene_family_node.id, predicate)
            results.append((edge, pathway_node))
    return results
def disease_get_gene(self, subject):
    """Get genes involved in a disease, via the Pharos disease API.

    Each DOID synonym of ``subject`` is looked up; every linked target that
    resolves to an HGNC id becomes a gene node with an edge from the
    disease. A response that is not JSON is logged and skipped, matching
    how ``drug_get_gene`` treats identifiers Pharos does not recognise.

    :param subject: disease node exposing ``get_synonyms_by_prefix`` and ``id``.
    :return: list of ``(edge, gene_node)`` tuples.
    """
    pharos_ids = subject.get_synonyms_by_prefix('DOID')
    resolved_edge_nodes = []
    for pharosid in pharos_ids:
        logging.getLogger('application').debug("Identifier:" + subject.id)
        url='https://pharos.nih.gov/idg/api/v1/diseases/%s?view=full' % pharosid
        logger.info(url)
        r = requests.get(url)
        try:
            result = r.json()
        except ValueError:
            # Non-JSON body: nothing to parse for this identifier.
            logger.info(f"No JSON returned for {url}")
            continue
        predicate=LabeledID(identifier='PHAROS:gene_involved', label='gene_involved')
        for link in result['links']:
            if link['kind'] != 'ix.idg.models.Target':
                continue
            pharos_target_id = int(link['refid'])
            logger.info(f"Pharos ID: {pharos_target_id}")
            hgnc = self.target_to_hgnc(pharos_target_id)
            if hgnc is not None:
                hgnc_node = KNode(hgnc, type=node_types.GENE)
                edge = self.create_edge(subject,hgnc_node,'pharos.disease_get_gene',pharosid,predicate,url=url)
                resolved_edge_nodes.append((edge, hgnc_node))
                logger.info(f" HGNC ID: {hgnc}")
            else:
                logging.getLogger('application').warning('Did not get HGNC for pharosID %d' % pharos_target_id)
    return resolved_edge_nodes
def disease_or_phenotypic_feature_to_sequence_variant(
        self, phenotype_node):
    """Return sequence variants associated with a trait in the GWAS Catalog.

    Trait ids come from the node's EFO synonyms, falling back to ORPHANET
    and then HP when the earlier prefix has none. For each association
    returned for a trait, every listed SNP becomes a ``DBSNP:`` variant node
    linked from the phenotype; the association's p-value (when parseable as
    a float) and PubMed id (when found) are attached to the edges.

    A KeyError or IndexError while parsing a trait's response is logged as
    a warning and stops processing of that trait only.

    :param phenotype_node: node exposing ``get_synonyms_by_prefix`` and ``id``.
    :return: list of ``(edge, variant_node)`` tuples.
    """
    ## TODO this could support Orphanet etc
    return_results = []

    # (synonym prefix, prefix used in the catalog URL), tried in order
    # until one yields trait ids.
    trait_ids = phenotype_node.get_synonyms_by_prefix('EFO')
    trait_prefix = 'EFO'
    for synonym_prefix, url_prefix in (('ORPHANET', 'Orphanet'), ('HP', 'HP')):
        if trait_ids:
            break
        trait_ids = phenotype_node.get_synonyms_by_prefix(synonym_prefix)
        trait_prefix = url_prefix

    for trait_id in trait_ids:
        query_url = f'{self.url}efoTraits/{trait_prefix}_{Text.un_curie(trait_id)}/associations?projection=associationByEfoTrait'
        query_json = self.query_service(query_url)
        if not query_json:
            continue
        try:
            for association in query_json['_embedded']['associations']:
                variant_nodes = []
                for snp in association['snps']:
                    rsid = snp['rsId']
                    variant_nodes.append(
                        KNode(f'DBSNP:{rsid}',
                              name=f'{rsid}',
                              type=node_types.SEQUENCE_VARIANT))
                if not variant_nodes:
                    continue

                props = {}
                try:
                    props['pvalue'] = float(association['pvalue'])
                except ValueError:
                    # Unparseable p-value: leave it off the edge.
                    pass

                pubs = []
                # The association id is the last path segment of its self link.
                association_id = association['_links']['self']['href'].rsplit('/', 1)[1]
                pubmed_id = self.get_pubmed_id_by_association(association_id)
                if pubmed_id:
                    pubs.append(f'PMID:{pubmed_id}')

                predicate = LabeledID(identifier='RO:0002609', label='related_to')
                for variant_node in variant_nodes:
                    edge = self.create_edge(
                        phenotype_node,
                        variant_node,
                        'gwascatalog.disease_or_phenotypic_feature_to_sequence_variant',
                        phenotype_node.id,
                        predicate,
                        url=query_url,
                        properties=props,
                        publications=pubs)
                    return_results.append((edge, variant_node))
        except (KeyError, IndexError) as e:
            logger.warning(
                f'problem parsing results from GWASCatalog: {e}')
    return return_results
def graph_get_pathways_by_gene(self, gene):  #reasoner
    """Return KEGG pathway nodes a gene participates in (triplestore query).

    The query is keyed on the upper-cased part of ``gene.id`` after the
    first ':' and is limited to 2000 rows.

    :param gene: gene node exposing ``id``.
    :return: list of ``(edge, pathway_node)`` tuples.
    """
    gene_symbol = gene.id.split(':')[1].upper()
    response = self.triplestore.query_template(
        inputs={"gene": gene_symbol},
        outputs=['keggPath'],
        template_text=""" prefix kegg: <http://chem2bio2rdf.org/kegg/resource/> prefix drugbank: <http://chem2bio2rdf.org/drugbank/resource/> prefix uniprot: <http://chem2bio2rdf.org/uniprot/resource/gene/> prefix ctd: <http://chem2bio2rdf.org/ctd/resource/> prefix mesh: <http://bio2rdf.org/mesh:> select ?drugGenericName ?uniprotGeneID ?pathwayName ?keggPath where { ?keggPath kegg:protein ?swissProtID ; kegg:Pathway_name ?pathwayName . ?keggInter kegg:cid ?pubchemCID . ?dbInter drugbank:GeneBank_ID ?geneBankID ; drugbank:SwissProt_ID ?swissProtID ; drugbank:gene ?uniprotGeneID . ?drugID drugbank:CID ?pubchemCID ; drugbank:Generic_Name ?drugGenericName . ?ctd_disease ctd:diseaseid ?diseaseID ; ctd:cid ?pubchemCID . values ( ?uniprotGeneID ) { ( uniprot:$gene ) } } LIMIT 2000""")
    results = []
    predicate = LabeledID(identifier='RO:0000056', label='participates_in')
    for row in response:
        # The pathway id is the last path segment of the returned IRI.
        pathway_id = row['keggPath'].split('/')[-1]
        node = KNode("KEGG:{0}".format(pathway_id), type=node_types.PATHWAY)
        edge = self.create_edge(gene, node,
                                'chembio.graph_get_pathways_by_gene',
                                gene.id, predicate)
        results.append((edge, node))
    return results
def get_biological_process_or_activity_by_gene_family(
        self, gene_family_node):
    """Create biological process/activity nodes associated with a gene family.

    For every row of the family, the non-empty entries of
    ``panther_molecular_func`` followed by those of
    ``panther_biological_process`` are split on '#' into a label and an id.

    :param gene_family_node: gene family node; its ``id`` selects the rows.
    :return: list of ``(edge, process_or_activity_node)`` tuples.
    """
    results = []
    # @TODO make sensible edge here
    predicate = LabeledID('RO:0000056', 'participates_in')
    for row in self.get_rows_using_curie(gene_family_node.id):
        entries = [
            x
            for column in ('panther_molecular_func', 'panther_biological_process')
            for x in row[column]
            if x != ''
        ]
        for entry in entries:
            label, term_id = entry.split('#')
            term_node = KNode(
                term_id,
                type=node_types.BIOLOGICAL_PROCESS_OR_ACTIVITY,
                name=label)
            edge = self.create_edge(
                gene_family_node, term_node,
                'panther.get_biological_process_or_activity_by_gene_family',
                gene_family_node.id, predicate)
            results.append((edge, term_node))
    return results
def process_variant_to_gene_relationships(self, variant_nodes: list, writer: WriterDelegator):
    """Write variant -> gene relationships for a batch of variant nodes.

    Results from ``genetics_services.get_variant_to_gene`` are grouped by
    source variant id. For every (edge, node) result the gene node is
    written once (tracked in ``self.written_genes``), the variant node is
    re-created and written when ``self.recreate_sv_node`` is set, and a
    ``KEdge`` carrying the original predicate is written.

    :param variant_nodes: variant nodes to look up.
    :param writer: receives ``write_node`` / ``write_edge`` calls.
    """
    all_results = self.genetics_services.get_variant_to_gene(self.crawl_for_service, variant_nodes)
    for source_node_id, results in all_results.items():
        # convert the simple edges and nodes to rags objects and write them to the graph
        for edge, node in results:
            gene_node = KNode(node.id,
                              type=node.type,
                              name=node.name,
                              properties=node.properties)

            if self.recreate_sv_node:
                variant_node = KNode(source_node_id, type= node_types.SEQUENCE_VARIANT)
                variant_node.add_export_labels([node_types.SEQUENCE_VARIANT])
                writer.write_node(variant_node)

            gene_id = gene_node.id
            if gene_id not in self.written_genes:
                writer.write_node(gene_node)
                self.written_genes.add(gene_node.id)

            predicate = LabeledID(identifier=edge.predicate_id, label=edge.predicate_label)
            gene_edge = KEdge(
                source_id=source_node_id,
                target_id=gene_node.id,
                provided_by=edge.provided_by,
                ctime=edge.ctime,
                original_predicate=predicate,
                # standard_predicate=predicate,
                input_id=edge.input_id,
                properties=edge.properties)
            writer.write_edge(gene_edge)
        logger.info(f'added {len(results)} variant relationships for {source_node_id}')
def graph_diseaseid_to_uniprot(self, drugbank):
    """Return gene nodes associated with a disease id in the triplestore.

    The lower-cased node id is printed and then bound as ``diseaseID`` in
    the query. Despite its name, the ``drugbank`` parameter is used as the
    disease node here.

    :param drugbank: node exposing ``id``.
    :return: list of ``(edge, gene_node)`` tuples, one per result row.
    """
    disease_id = drugbank.id.lower()
    print(disease_id)
    response = self.triplestore.query_template(
        inputs={"diseaseID": disease_id},
        outputs=["uniprotGeneID"],
        template_text=""" prefix drugbank: <http://chem2bio2rdf.org/drugbank/resource/> prefix drugbank_drug: <http://chem2bio2rdf.org/drugbank/resource/drugbank_drug/> prefix ctd: <http://chem2bio2rdf.org/ctd/resource/> prefix mesh.disease: <http://bio2rdf.org/mesh:> select distinct ?uniprotGeneID where { values ( ?diseaseID ) { ( $diseaseID ) } ?dbInter drugbank:gene ?uniprotGeneID . ?drugID drugbank:CID ?pubchemCID. ?ctd_disease ctd:diseaseid ?diseaseID ; ctd:cid ?pubchemCID . }""")
    predicate = LabeledID(identifier='NCIT:R176', label='disease to gene association')
    results = []
    for row in response:
        # The gene id is the last path segment of the returned IRI.
        gene_id = row['uniprotGeneID'].split('/')[-1]
        node = KNode("UNIPROT:{0}".format(gene_id), type=node_types.GENE)
        edge = self.create_edge(drugbank, node,
                                'chembio.graph_diseaseid_to_uniprot',
                                drugbank.id, predicate)
        results.append((edge, node))
    return results
def parse_edges(self, provided_by, limit=0):
    """Yield ``(index, edge)`` pairs built from ``edges.txt`` in ``self.cord_dir``.

    The file is read as tab-separated with a header row; each row supplies
    ``Term1``, ``Term2``, ``Effective_Pubs`` and ``Enrichment_p``. Every
    edge uses the ``SEMMEDDB:ASSOCIATED_WITH`` predicate, which is also set
    as its standard predicate.

    :param provided_by: provenance string; required.
    :param limit: when non-zero, stop after this many edges.
    :raises RuntimeError: on first iteration when ``provided_by`` is falsy
        (this is a generator, so nothing runs at call time).
    """
    if not provided_by:
        raise RuntimeError(
            'Error edge property provided by is not specified')
    edges_path = os.path.join(self.cord_dir, 'edges.txt')
    with open(edges_path) as edges_file:
        rows = csv.DictReader(edges_file, delimiter='\t')
        for index, edge_raw in enumerate(rows):
            predicate = LabeledID(identifier='SEMMEDDB:ASSOCIATED_WITH', label='related_to')
            source_node = KNode(edge_raw['Term1'])
            target_node = KNode(edge_raw['Term2'])
            edge = self.create_edge(
                source_node=source_node,
                target_node=target_node,
                input_id=edge_raw['Term1'],
                provided_by=provided_by,
                predicate=predicate,
                publications=[],
                properties={
                    'num_publications': float(edge_raw['Effective_Pubs']),
                    'enrichment_p': float(edge_raw['Enrichment_p'])
                })
            edge.standard_predicate = predicate
            # The limit is checked after the edge is built, as before.
            if limit and index + 1 > limit:
                break
            yield index, edge
def get_gene_family_by_gene_family(self, gene_family_node):
    """Create gene family nodes related to a gene family by 'part of'.

    When the node's curie has no sub-family id, one node is created per
    sub-family listed under the family and each edge runs
    sub-family -> family. Otherwise the node is itself a sub-family and a
    single edge to its parent family is returned.

    :param gene_family_node: gene family node exposing ``id``.
    :return: list of ``(edge, gene_family_node)`` tuples.
    """
    fam_id, sub_fam_id = self.get_family_sub_family_ids_from_curie(
        gene_family_node.id)
    predicate = LabeledID('BFO:0000050', 'part of')

    if sub_fam_id is not None:
        # we are a sub family: link up to the parent family
        family_node = self.__create_gene_family_node(
            fam_id, self.gene_family_data[fam_id]['family_name'])
        edge = self.create_edge(gene_family_node, family_node,
                                'panther.get_gene_family_by_gene_family',
                                gene_family_node.id, predicate)
        return [(edge, family_node)]

    # we are looking for subfamilies
    response = []
    family_entry = self.gene_family_data[fam_id]
    sub_ids = [key for key in family_entry if key != 'family_name']
    for sub_id in sub_ids:
        panther_id = f'{fam_id}:{sub_id}'
        # logger.debug(f'GENE _ FAMILY DATA: { self.gene_family_data[fam_id]}')
        sub_family_node = self.__create_gene_family_node(
            panther_id,
            self.gene_family_data[fam_id][sub_id]['sub_family_name'])
        edge = self.create_edge(
            sub_family_node, gene_family_node,
            'panther.get_gene_family_by_gene_family',
            sub_family_node.id, predicate)
        response.append((edge, sub_family_node))
    return response
def gene_to_tissues(self, drug):
    """Return anatomical entities whose expression correlates with a gene.

    Each NCBIGENE synonym of the input node is queried against the
    RNAseqDB bicluster endpoint; every row with a non-empty
    ``col_enrich_UBERON`` becomes an anatomy node linked to the gene.

    The earlier body referenced undefined ``gene_node`` and ``props`` and so
    raised NameError on the first usable row; the anatomy node is now the
    partner node and no extra properties are sent.

    :param drug: the gene node (name kept for caller compatibility),
        exposing ``get_synonyms_by_prefix``.
    :return: list of ``(edge, anatomy_node)`` tuples.
    """
    output = []
    identifiers = drug.get_synonyms_by_prefix('NCBIGENE')
    for identifier in identifiers:
        url = f"{self.url}/RNAseqDB_bicluster_gene_to_tissue_gene/ncbigene:{Text.un_curie(identifier)}/"
        rows = requests.get(url).json()
        for r in rows:
            anatomy_id = r['col_enrich_UBERON']
            if anatomy_id == '':
                continue
            predicate = LabeledID(identifier='RO:0002610', label='correlated with')
            anat_node = KNode(anatomy_id, type=node_types.ANATOMICAL_ENTITY)
            # NOTE(review): direction test kept from the original; with a
            # constant predicate it looks like copy-paste residue — confirm
            # whether the edge should simply always run gene -> anatomy.
            if any(s in predicate.identifier for s in self.g2d_strings):
                source, target = anat_node, drug
            else:
                source, target = drug, anat_node
            # NOTE(review): the provided_by string 'ctd.drug_to_gene' is
            # unchanged but does not match this function's name — confirm.
            edge = self.create_edge(
                source,
                target,
                'ctd.drug_to_gene',
                identifier,
                predicate,
                publications=[f"PMID:{r['PubMedIDs']}"],
                url=url)
            output.append((edge, anat_node))
    return output
def create_phenotype_to_variant_components(self,
                                           query_url,
                                           phenotype_node,
                                           variant_id,
                                           variant_label,
                                           pubmed_id=None,
                                           properties=None):
    """Build the variant node and phenotype -> variant edge for one association.

    :param query_url: URL recorded on the edge.
    :param phenotype_node: source node of the edge; its ``id`` is the input id.
    :param variant_id: identifier of the sequence variant node.
    :param variant_label: name of the sequence variant node.
    :param pubmed_id: optional PubMed id, recorded as ``PMID:<id>``.
    :param properties: optional edge properties. A fresh dict is used when
        omitted, so calls never share one mutable default.
    :return: ``(edge, variant_node)``.
    """
    if properties is None:
        properties = {}
    variant_node = KNode(variant_id,
                         name=variant_label,
                         type=node_types.SEQUENCE_VARIANT)
    pubs = [f'PMID:{pubmed_id}'] if pubmed_id else []
    predicate = LabeledID(identifier='RO:0002609', label='related_to')
    edge = self.create_edge(
        phenotype_node,
        variant_node,
        'gwascatalog.disease_or_phenotypic_feature_to_sequence_variant',
        phenotype_node.id,
        predicate,
        url=query_url,
        properties=properties,
        publications=pubs)
    return (edge, variant_node)