def get_anatomy_parts(self, anatomy_identifier):
    """Look up the anatomy terms that are parts of the given UBERON term.

    An identifier supplied as an ``http`` IRI is first converted to a CURIE
    with ``Text.obo_to_curie``.  The triplestore is then asked for every
    ``?part`` linked to the term through ``BFO:0000051`` that is also a
    subclass of ``UBERON:0001062`` in the redundant graph.

    Returns the rows produced by ``query_template`` (keys ``part`` and
    ``partlabel``), each extended in place with a ``curie`` entry holding
    the CURIE form of ``part``.
    """
    curie = anatomy_identifier
    if curie.startswith('http'):
        curie = Text.obo_to_curie(curie)

    # Adjacent literals are concatenated into one single-line query string.
    sparql = (
        " prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>"
        " prefix UBERON: <http://purl.obolibrary.org/obo/UBERON_>"
        " prefix BFO: <http://purl.obolibrary.org/obo/BFO_>"
        " select distinct ?part ?partlabel"
        " from <http://reasoner.renci.org/nonredundant>"
        " from <http://example.org/uberon-hp-cl.ttl>"
        " where {"
        " $anatomy_id BFO:0000051 ?part ."
        " graph <http://reasoner.renci.org/redundant> {"
        " ?part rdfs:subClassOf UBERON:0001062 ."
        " }"
        " ?part rdfs:label ?partlabel ."
        " }"
        " "
    )

    matches = self.triplestore.query_template(
        inputs={'anatomy_id': curie},
        outputs=['part', 'partlabel'],
        template_text=sparql,
    )
    for match in matches:
        match['curie'] = Text.obo_to_curie(match['part'])
    return matches
def build_exact_sets(o,u):
    """Build one set of equivalent identifiers per id known to ``o``.

    For every id returned by ``o.get_ids()`` the exact matches from
    ``o.get_exact_matches`` are upper-cased with ``Text.upper_curie``,
    anything starting with ``ICD`` is dropped, and a ``LabeledID`` for the
    id itself (label taken from ``u.get_label``) is added.

    Progress and the intermediate values are printed to stdout.

    Returns a list of sets, one per id, in the order of ``o.get_ids()``.
    """
    sets = []
    mids = o.get_ids()
    total = len(mids)
    print(total)
    started = dt.now()
    for n, mid in enumerate(mids):
        if n % 100 == 0 and n > 0:
            # total_seconds() rather than .seconds: the latter wraps around
            # after a day and would corrupt the estimate on long runs.
            delt = int((dt.now() - started).total_seconds())
            f = n / total
            print(f'{n}/{total} = {f} in {delt} s')
            print(f' estimated time remaining = {delt * (1-f)/(f)}')
        #FWIW, ICD codes tend to be mapped to multiple MONDO identifiers, leading to mass confusion. So we
        #just excise them here. It's possible that we'll want to revisit this decision in the future. If so,
        #then we probably will want to set a 'glommable' and 'not glommable' set.
        print(mid)
        matches = [Text.upper_curie(x) for x in o.get_exact_matches(mid)]
        print(matches)
        dbx = {x for x in matches if not x.startswith('ICD')}
        label = u.get_label(mid)
        print(label)
        dbx.add(LabeledID(Text.upper_curie(mid), label))
        sets.append(dbx)
    return sets
def term_to_term(self,node_a,node_b,limit = 10000):
    """Check the co-occurrence of two disease nodes through their ICD9 codes.

    Every synonym of each node that starts with ``ICD9`` is considered.
    Codes missing from ``self.icd9_codes`` are skipped with a debug message.
    For each remaining pair, the stored entry in ``self.icd9_paircounts`` is
    used when present; otherwise an entry with count ``'<11'`` and an
    expected value computed from the individual counts is synthesised.

    Returns the result of ``self.make_edge`` when at least one pair was
    collected, otherwise ``None``.  ``limit`` is accepted but not used.
    """
    codes_a = [syn for syn in node_a.synonyms if syn.startswith('ICD9')]
    codes_b = [syn for syn in node_b.synonyms if syn.startswith('ICD9')]
    if not codes_a or not codes_b:
        #can't do co-occurence unless we get icd9 codes
        return

    log = logging.getLogger('application')
    pairs = []
    for curie_a in codes_a:
        code_a = Text.un_curie(curie_a)
        if code_a not in self.icd9_codes:
            log.debug('Dont have data for {}'.format(code_a))
            continue
        for curie_b in codes_b:
            code_b = Text.un_curie(curie_b)
            if code_b not in self.icd9_codes:
                log.debug('Dont have data for {}'.format(code_b))
                continue
            # Both codes are known to our data set at this point.
            key = (code_a, code_b)
            if key in self.icd9_paircounts:
                pairs.append((key, self.icd9_paircounts[key]))
                continue
            # No stored pair count: there were fewer than 11 shared counts.
            count_a = self.icd9_codes[code_a]
            count_b = self.icd9_codes[code_b]
            expected = float(count_a) * float(count_b) / self.total
            pairs.append(
                (key, {'c1': count_a, 'c2': count_b, 'c': '<11', 'e': expected, 'p': None})
            )

    if pairs:
        return self.make_edge(pairs, node_a, node_b)
    return None
def gene_to_drug_expanded(self, gene_node):
    """Collect chemical/gene edges for each NCBIGENE synonym of a gene node.

    Each synonym is queried against the ``CTD_chem_gene_expanded_geneID``
    endpoint under ``self.url``.  Rows rejected by
    ``check_expanded_gene_chemical_row`` are skipped.  The row's
    ``direction`` decides which node is the edge source: ``'->'`` means the
    chemical, anything else the gene.

    Returns a list of ``(edge, drug_node)`` tuples, de-duplicated per
    synonym on (drug id, original predicate label).
    """
    output = []
    for identifier in gene_node.get_synonyms_by_prefix('NCBIGENE'):
        seen = set()
        geneid = Text.un_curie(identifier)
        url = f"{self.url}CTD_chem_gene_expanded_geneID/ncbigene:{geneid}/"
        rows = requests.get (url).json ()
        for row in rows:
            good_row, predicate_label, props, pmids = self.check_expanded_gene_chemical_row(row)
            if not good_row:
                continue
            predicate = self.normalize_predicate(
                LabeledID(identifier=f"CTD:{Text.snakify(predicate_label)}", label=predicate_label)
            )
            #Should this be substance?
            drug_node = KNode(
                Text.upper_curie(row['chemicalID']),
                type=node_types.CHEMICAL_SUBSTANCE,
                name=row['chem_label'],
            )
            if row['direction'] == '->':
                source, target = drug_node, gene_node
            else:
                source, target = gene_node, drug_node
            edge = self.create_edge(
                source, target, 'ctd.gene_to_drug_expanded', identifier, predicate,
                properties=props, url=url, publications=pmids,
            )
            #This is what we'd like it to be, but right now there's not enough real specificity on the predicates
            #key = (drug_node.id, edge.standard_predicate.label)
            key = (drug_node.id, edge.original_predicate.label)
            if key in seen:
                continue
            output.append((edge, drug_node))
            seen.add(key)
    return output
def term_get_ancestors(self, node_type, root_iri):
    """Run ``self.query`` for ``root_iri`` and turn the rows into nodes and edges.

    Every row yields a child node and a parent node, both typed with
    ``node_type``, connected by an ``rdfs:subClassOf`` edge from child to
    parent.  Rows whose parent and child resolve to the same id are ignored.

    Returns a ``(nodes, edges)`` tuple of sets.
    """
    rows = self.triplestore.query_template(
        template_text=self.query,
        inputs={'root_uri': root_iri},
        outputs=['parent_id', 'parent_label', 'child_id', 'child_label'])
    print('found total ', len(rows), ' results.')

    nodes, edges = set(), set()
    for row in rows:
        # Output type would be same as input type?
        parent = KNode(Text.obo_to_curie(row['parent_id']), name=row['parent_label'], type=node_type)
        child = KNode(Text.obo_to_curie(row['child_id']), name=row['child_label'], type=node_type)
        if parent.id == child.id:
            # refrain from adding edge to the node itself
            continue
        subclass_of = LabeledID(identifier='rdfs:subClassOf', label='subclass of')
        link = self.create_edge(
            source_node=child,
            target_node=parent,
            predicate=subclass_of,
            provided_by='uberongraph.term_get_ancestors',
            input_id=child.id)
        nodes.add(child)
        nodes.add(parent)
        edges.add(link)
    return nodes, edges
def normalize(self, node):
    """Pick the preferred identifier for ``node`` and prune its synonyms.

    The node carries many candidate identifiers in ``node.synonyms``.  The
    preferred one is the first whose prefix appears in the ``id_prefixes``
    configured for ``node.type`` in ``self.concepts``, scanning prefixes in
    configuration order.  ``node.id`` (and ``node.name`` unless the chosen
    label is the empty string) are updated, and synonyms whose prefix is not
    configured for the type are removed.  The node is modified in place;
    nothing is returned.
    """
    #If we have two synonyms with the same id, but one has no label, chuck it
    labels_by_id = defaultdict(list)
    for labeledid in node.synonyms:
        labels_by_id[labeledid.identifier].append(labeledid.label)
    for lid, labels in labels_by_id.items():
        if len(labels) > 1 and (None in labels):
            node.synonyms.remove(LabeledID(identifier=lid, label=None))
        if len(labels) > 1 and ('' in labels):
            node.synonyms.remove(LabeledID(identifier=lid, label=''))

    #Now find the best one for an id
    type_curies = self.concepts.get(node.type).id_prefixes
    synonyms_by_curie = defaultdict(list)
    for s in node.synonyms:
        synonyms_by_curie[Text.get_curie(s.identifier)].append(s)

    for type_curie in type_curies:
        potential_identifiers = synonyms_by_curie[type_curie]
        if len(potential_identifiers) == 0:
            continue
        if len(potential_identifiers) > 1:
            # Prefer candidates that carry a label, then take the smallest.
            ids_with_labels = [pi for pi in potential_identifiers if pi.label is not None]
            if len(ids_with_labels) > 0:
                potential_identifiers = ids_with_labels
            potential_identifiers.sort()
        best = potential_identifiers[0]
        node.id = best.identifier
        #Only replace the label if we have a label.
        if best.label != '':
            node.name = best.label
        break

    #Remove any synonyms with extraneous prefixes. The point of this is not so much to remove
    # unknown prefixes, as to make sure that if we got e.g. a meddra, and we downcast it to a disease,
    # that we don't end up with HP's in the equivalent ids.
    bad_synonyms = set()
    for synonym in node.synonyms:
        if isinstance(synonym, LabeledID):
            prefix = Text.get_curie(synonym.identifier)
        else:
            prefix = Text.get_curie(synonym)
        if prefix not in type_curies:
            bad_synonyms.add(synonym)
    for bs in bad_synonyms:
        node.synonyms.remove(bs)

    if node.id.startswith('DOID'):
        # logger.warning replaces the deprecated logger.warn alias.
        logger.warning("We are ending up with a DOID here")
        logger.warning(node.id)
        logger.warning(node.synonyms)
        logger.warning(node.type)
def graph_drugname_to_gene_symbol(self, drug_name_node):
    """Turn ``drug_name_to_gene_symbol`` hits for a drug name into graph pairs.

    The drug name is the un-CURIEd ``identifier`` of ``drug_name_node``.
    Each hit yields a ``targets`` edge and a gene node whose id is
    ``UNIPROT:`` plus the last path segment of the hit's ``uniprotSym``.

    Returns a list of ``(edge, node)`` tuples.
    """
    drug_name = Text.un_curie(drug_name_node.identifier)
    return [
        (
            self.get_edge(hit, predicate="targets"),
            KNode("UNIPROT:{0}".format(Text.path_last(hit['uniprotSym'])), node_types.GENE),
        )
        for hit in self.drug_name_to_gene_symbol(drug_name)
    ]
def graph_name_to_drugbank(self, drug_name_node):
    """Turn lookup hits for a drug name into DrugBank graph pairs.

    The drug name is the un-CURIEd ``identifier`` of ``drug_name_node``.
    Each hit yields a ``drugname`` edge and a drug node whose id is
    ``DRUGBANK:`` plus the last path segment of the hit's ``drugID`` and
    whose label is the hit's ``drugName``.

    Returns a list of ``(edge, node)`` tuples.
    """
    drug_name = Text.un_curie(drug_name_node.identifier)
    # NOTE(review): the lookup used here is drug_name_to_gene_symbol, yet the
    # rows are read for 'drugID'/'drugName' -- confirm this is the intended helper.
    return [
        (
            self.get_edge(hit, predicate="drugname"),
            KNode(
                "DRUGBANK:{0}".format(Text.path_last(hit['drugID'])),
                node_types.DRUG,
                label=hit['drugName'],
            ),
        )
        for hit in self.drug_name_to_gene_symbol(drug_name)
    ]
def build_sets(o, ignore_list = ('ICD',)):
    """Build one set of cross-referenced identifiers per id known to ``o``.

    Args:
        o: source object providing ``get_ids``, ``get_xrefs`` and ``get_label``.
        ignore_list: iterable of prefixes; any xref starting with one of them
            is dropped.  The default is an immutable tuple so it cannot be
            shared and mutated across calls.

    Each xref is upper-cased with ``Text.upper_curie`` and passed through
    ``norm``; a ``LabeledID`` for the id itself is then added to the set.

    Returns a list of sets, one per id, in the order of ``o.get_ids()``.
    """
    # str.startswith accepts a tuple, and an empty tuple never matches,
    # which mirrors folding over an empty ignore list.
    ignored = tuple(ignore_list)
    sets = []
    for mid in o.get_ids():
        #FWIW, ICD codes tend to be mapped to multiple MONDO identifiers, leading to mass confusion. So we
        #just excise them here. It's possible that we'll want to revisit this decision in the future. If so,
        #then we probably will want to set a 'glommable' and 'not glommable' set.
        dbx = {Text.upper_curie(x) for x in o.get_xrefs(mid) if not x.startswith(ignored)}
        dbx = {norm(x) for x in dbx}
        label = o.get_label(mid)
        dbx.add(LabeledID(Text.upper_curie(mid), label))
        sets.append(dbx)
    return sets
def synonymize(node,gt):
    """Collect gene synonyms for ``node``, swapping a UniProtKB id first.

    When the node id has the ``UNIPROTKB`` prefix, synonyms are fetched from
    ``gt.uniprot``; if any come back they are added to ``node.synonyms`` as
    empty-label ``LabeledID`` objects and the first becomes ``node.id``.
    HGNC synonyms are then fetched for the (possibly updated) id unless it is
    still a UniProtKB id, in which case an empty set is returned.

    Raises:
        Exception: if ``node.type`` is not ``node_types.GENE``.
    """
    if not node.type == node_types.GENE:
        raise Exception("Incorrect node type")

    if Text.get_curie(node.id).upper() == 'UNIPROTKB':
        replacements = gt.uniprot.get_synonyms(node.id)
        if len(replacements) > 0:
            node.synonyms.update(
                [LabeledID(identifier=new_id, label='') for new_id in replacements]
            )
            node.id = replacements[0]

    if Text.get_curie(node.id).upper() == 'UNIPROTKB':
        return set()
    return gt.hgnc.get_synonyms(node.id)
def add_chemotext_terms(self,nodes):
    """Record the chemotext term for every MESH synonym of each node.

    For each synonym whose CURIE prefix is ``MESH`` the bare id is looked up
    with ``self.ctext.get_chemotext_term_from_meshid``.  A term that is found
    is appended to ``self.identifier_to_label[node.id]``; a missing term is
    reported as a warning and otherwise ignored.
    """
    log = logging.getLogger('application')
    log.debug('{} nodes'.format(len(nodes) ))
    for node in nodes:
        log.debug('node: {}'.format(node.id) )
        mesh_identifiers = [syn for syn in node.synonyms if Text.get_curie(syn) == 'MESH']
        for mesh_id in mesh_identifiers:
            log.debug(' mesh_id: {}'.format(mesh_id) )
            bare_id = Text.un_curie(mesh_id)
            cterm = self.ctext.get_chemotext_term_from_meshid( bare_id )
            if cterm is None:
                # Logger.warning is the supported spelling; .warn is deprecated.
                log.warning(" Cannot find chemotext synonym for %s (%s) %s",
                            bare_id, mesh_id, node.id)
            else:
                log.debug(' node: {}, label: {}, chemotext: {}'.format(node.id, bare_id, cterm) )
                self.identifier_to_label[node.id].append(cterm)
def gene_get_drug(self, gene_node):
    """Get the drugs Pharos links to a gene.

    Each ``UNIPROTKB`` synonym of ``gene_node`` is looked up at the Pharos
    ``targets`` endpoint.  Every ``ix.idg.models.Ligand`` link whose drug id
    resolves through ``self.drugid_to_identifiers`` produces a drug node and a
    ``PHAROS:drug_targets`` edge from the drug to the gene.

    Lookups are best effort: a response that is not JSON, or any other error
    while handling one synonym, is logged at debug level and the next synonym
    is tried.

    Returns a list of ``(edge, drug_node)`` tuples.
    """
    resolved_edge_nodes = []
    for s in gene_node.get_synonyms_by_prefix('UNIPROTKB'):
        try:
            logger.debug(f'Call with {s}')
            pharosid = Text.un_curie(s)
            url = 'https://pharos.nih.gov/idg/api/v1/targets(%s)?view=full' % pharosid
            r = requests.get(url)
            try:
                result = r.json()
            except ValueError:
                #If pharos doesn't know the identifier, it just 404s. move to the next
                logger.debug('404')
                continue
            else:
                logger.debug('back')
            predicate = LabeledID(identifier='PHAROS:drug_targets', label='is_target')
            for link in result['links']:
                if link['kind'] != 'ix.idg.models.Ligand':
                    continue
                chembl_id, label = self.drugid_to_identifiers(link['refid'])
                if chembl_id is None:
                    continue
                drug_node = KNode(chembl_id, type=node_types.CHEMICAL_SUBSTANCE, name=label)
                edge = self.create_edge(drug_node, gene_node, 'pharos.gene_get_drug',
                                        pharosid, predicate, url=url)
                resolved_edge_nodes.append((edge, drug_node))
        except Exception:
            # The placeholder is required: passing the synonym as a bare extra
            # argument makes the logging call itself fail to format.
            logger.debug("Error encountered calling pharos with %s", s, exc_info=True)
    logger.debug('ok')
    return resolved_edge_nodes
def graph_drugbank_to_uniprot(self, drugbank):
    """Find the UniProt genes associated with a DrugBank drug node.

    The drug id sent to the triplestore is ``DB`` followed by the un-CURIEd
    ``drugbank.identifier``.  Each returned ``uniprotGeneID`` IRI is reduced
    to its last path segment and wrapped in a ``UNIPROT:`` gene node linked
    to the drug by a ``SIO:001257`` edge.

    Returns a list of ``(edge, node)`` tuples.
    """
    # Adjacent literals are concatenated into one single-line query string.
    sparql = (
        " prefix drugbank: <http://chem2bio2rdf.org/drugbank/resource/>"
        " prefix drugbank_drug: <http://chem2bio2rdf.org/drugbank/resource/drugbank_drug/>"
        " prefix ctd: <http://chem2bio2rdf.org/ctd/resource/>"
        " select distinct ?uniprotGeneID where {"
        " values ( ?drugID ) { ( drugbank_drug:${drugID} ) }"
        " ?dbInter drugbank:GeneBank_ID ?geneBankID ;"
        " drugbank:gene ?uniprotGeneID ."
        " ?drugID drugbank:CID ?pubchemCID ;"
        " drugbank:Generic_Name ?drugGenericName ."
        " ?ctd_disease ctd:diseaseid ?diseaseID ;"
        " ctd:cid ?pubchemCID ."
        " }"
    )
    response = self.triplestore.query_template(
        inputs={"drugID": "DB{0}".format(Text.un_curie(drugbank.identifier))},
        outputs=["uniprotGeneID"],
        template_text=sparql)

    predicate = LabeledID(identifier='SIO:001257', label='chemical to gene association')
    results = []
    for r in response:
        gene_id = r['uniprotGeneID'].split('/')[-1]
        node = KNode("UNIPROT:{0}".format(gene_id), type=node_types.GENE)
        # Keyword arguments keep the input id and predicate from being passed
        # in each other's position, as the positional call did.
        # NOTE(review): keyword names follow the other create_edge call in
        # this file that uses them -- confirm against the base class.
        edge = self.create_edge(
            drugbank, node,
            provided_by='chembio.graph_drugbank_to_uniprot',
            input_id=drugbank.id,
            predicate=predicate)
        # list.append takes a single argument, so the pair must be a tuple.
        results.append((edge, node))
    return results
def disease_get_gene(self, disease_node):
    """Get the genes Pharos associates with a disease node.

    The node's DOID, UMLS, MESH, OMIM and ORPHANET synonyms are rewritten
    into the id spellings used in the Pharos disease table and each is looked
    up in ``self.db``.  Every HGNC id is reported once, on first sight.

    Returns a list of ``(edge, gene_node)`` tuples.
    """
    resolved_edge_nodes = []
    hgncs = set()
    # WD:P2293 gene assoc with condition.
    # domain is gene and range is disease or phenotype for this relationship
    predicate = LabeledID(identifier='WD:P2293', label='gene_involved')
    #Pharos contains multiple kinds of disease identifiers in its disease table:
    # For OMIM identifiers, they can have either prefix OMIM or MIM
    # UMLS doesn't have any prefixes.... :(
    pharos_predicates = {'DOID':('DOID',),'UMLS':(None,),'MESH':('MESH',),'OMIM':('OMIM','MIM'),'ORPHANET':('Orphanet',)}
    # The disease id is bound as a parameter, never spliced into the SQL
    # text, so quotes in an identifier cannot alter the statement.
    query = (
        "select distinct x.value, p.sym from disease d "
        "join xref x on x.protein_id = d.target_id "
        "join protein p on d.target_id = p.id "
        "where x.xtype = 'HGNC' and d.dtype <> 'Expression Atlas' and d.did=%s;"
    )
    for ppred, dbpreds in pharos_predicates.items():
        pharos_candidates = [Text.un_curie(x) for x in disease_node.get_synonyms_by_prefix(ppred)]
        for dbpred in dbpreds:
            if dbpred is None:
                pharos_ids = pharos_candidates
            else:
                pharos_ids = [f'{dbpred}:{x}' for x in pharos_candidates]
            for pharos_id in pharos_ids:
                cursor = self.db.cursor(dictionary = True, buffered = True)
                try:
                    cursor.execute(query, (pharos_id,))
                    for result in cursor:
                        label = result['sym']
                        hgnc = result['value']
                        if hgnc in hgncs:
                            continue
                        hgncs.add(hgnc)
                        gene_node = KNode(hgnc, type=node_types.GENE, name=label)
                        edge = self.create_edge(gene_node, disease_node, 'pharos.disease_get_gene', pharos_id, predicate)
                        resolved_edge_nodes.append((edge, gene_node))
                finally:
                    # Release the cursor even if a row fails to process.
                    cursor.close()
    return resolved_edge_nodes
def gene_to_disease(self, gene):
    """Find diseases connected to a gene and pair each with an ``affects`` edge.

    Only identifiers with an ``HGNC``, ``UNIPROT`` or ``PHAROS`` prefix are
    queried; any other prefix yields an empty list.  The query result, and
    its type, are printed to stdout.

    Returns a list of ``(edge, disease_node)`` tuples.
    """
    if Text.get_curie(gene.identifier) not in ['HGNC', 'UNIPROT', 'PHAROS']:
        return []

    # NOTE(review): the gene name is formatted straight into the query text;
    # consider a parameterised query if self.query supports one.
    cypher = "MATCH (d:Disease)-[a1]-(g:Gene) WHERE g.name='{0}' RETURN a1,d".format(
        Text.un_curie(gene.identifier))
    result = self.query(cypher, labels=['Disease'])
    # result = self.nodes_and_edges (result)
    for r in result:
        print(r)
    print(result)
    print(type(result))
    #print ("-------------------> {}".format (json.dumps (result, indent=2)))

    pairs = []
    for r in result:
        edge = self.get_edge({'res': r}, predicate='affects')
        pairs.append((edge, KNode(r['identifier'], node_types.DISEASE)))
    return pairs
def drug_get_gene(self, subject):
    """Get the gene targets Pharos lists for a drug (ligand).

    The un-CURIEd ``subject.identifier`` is looked up at the Pharos
    ``ligands`` endpoint.  Every ``ix.idg.models.Target`` link whose Pharos
    target id maps to an HGNC id via ``self.target_to_hgnc`` yields a
    ``KEdge`` carrying the link's properties and a gene node.  Targets
    without an HGNC id are reported as warnings.

    Returns a list of ``(edge, gene_node)`` tuples.
    """
    pharosid = Text.un_curie (subject.identifier)
    r = requests.get('https://pharos.nih.gov/idg/api/v1/ligands(%s)?view=full' % pharosid)
    result = r.json()
    resolved_edge_nodes = []
    for link in result['links']:
        if link['kind'] != 'ix.idg.models.Target':
            continue
        pharos_target_id = int(link['refid'])
        pharos_edge = KEdge( 'pharos', 'drug_get_gene', {'properties': link['properties']} )
        #Pharos returns target ids in its own numbering system. Collect other names for it.
        hgnc = self.target_to_hgnc (pharos_target_id)
        if hgnc is not None:
            hgnc_node = KNode (hgnc, node_types.GENE)
            resolved_edge_nodes.append( (pharos_edge, hgnc_node) )
        else:
            # Logger.warning is the supported spelling; .warn is deprecated.
            logging.getLogger('application').warning(
                'Did not get HGNC for pharosID %d', pharos_target_id)
    return resolved_edge_nodes
def gene_to_drug(self, gene_node):
    """Collect chemical/gene interaction edges for each NCBIGENE synonym.

    Each synonym is queried against the ``CTD_chem_gene_ixns_GeneID``
    endpoint under ``self.url``.  Rows for a different ``GeneID`` and rows
    rejected by ``check_gene_chemical_row`` are skipped.  The gene is the
    edge source when the normalised predicate identifier contains any of
    ``self.g2d_strings``; otherwise the chemical is.

    Returns a list of ``(edge, drug_node)`` tuples, de-duplicated per
    synonym on (drug id, original predicate label).
    """
    output = []
    for identifier in gene_node.get_synonyms_by_prefix('NCBIGENE'):
        seen = set()
        geneid = Text.un_curie(identifier)
        url = f"{self.url}/CTD_chem_gene_ixns_GeneID/{geneid}/"
        rows = requests.get (url).json ()
        for r in rows:
            if r['GeneID'] != geneid:
                continue
            good_row, predicate_label, props = self.check_gene_chemical_row(r)
            if not good_row:
                continue
            predicate = self.normalize_predicate(
                LabeledID(identifier=f'CTD:{predicate_label}', label=predicate_label)
            )
            #Should this be substance?
            drug_node = KNode(
                f"MESH:{r['ChemicalID']}",
                type=node_types.CHEMICAL_SUBSTANCE,
                name=f"{r['ChemicalName']}",
            )
            gene_is_source = any(s in predicate.identifier for s in self.g2d_strings)
            if gene_is_source:
                source, target = gene_node, drug_node
            else:
                source, target = drug_node, gene_node
            pubs = [f"PMID:{x}" for x in r['PubMedIDs'].split('|') ]
            edge = self.create_edge(
                source, target, 'ctd.gene_to_drug', identifier, predicate,
                publications=pubs, url=url, properties=props,
            )
            #This is what we'd like it to be, but right now there's not enough real specificity on the predicates
            #key = (drug_node.id, edge.standard_predicate.label)
            key = (drug_node.id, edge.original_predicate.label)
            if key in seen:
                continue
            output.append((edge, drug_node))
            seen.add(key)
    return output
def go_term_to_cell_xontology_relationships(self, go_node):
    """Relate a GO term to cell types through QuickGO cross-ontology relations.

    Only relations whose id starts with ``CL:`` and whose relation maps to a
    predicate through ``self.get_predicate`` are kept.

    Returns a list of ``(edge, cell_node)`` tuples; empty when the response
    has no ``results`` key.
    """
    #This call is not paged!
    url = "{0}/QuickGO/services/ontology/go/terms/GO:{1}/xontologyrelations".format(
        self.url, Text.un_curie(go_node.id))
    response = self.query(url)
    if 'results' not in response:
        return []

    pairs = []
    for entry in response['results']:
        if 'xRelations' not in entry:
            continue
        for xrel in entry['xRelations']:
            if not xrel['id'].startswith('CL:'):
                continue
            predicate = self.get_predicate(xrel['relation'])
            if predicate is None:
                continue
            cell_node = KNode(xrel['id'], type=node_types.CELL, name=xrel['term'])
            edge = self.create_edge(
                go_node,
                cell_node,
                'quickgo.go_term_to_cell_xontology_relationships',
                go_node.id,
                predicate,
                url=url)
            pairs.append((edge, cell_node))
    return pairs
def __init__(self, *args, **kwargs):
    """Initialise the node, accepting an id as the sole positional argument.

    A single positional string, or a single list whose first element is a
    string, is taken as the id and not forwarded to the parent initialiser.
    Everything else is passed to ``super().__init__`` unchanged.  Afterwards
    a list-valued name is reduced to its first element, an ``http`` id is
    converted to a CURIE, and the synonym set is seeded with the node's own
    id and name.
    """
    self.id = None
    self.name = None
    self.type = None
    self.original_curie = None
    self.properties = {}

    if len(args) == 1:
        sole = args[0]
        if isinstance(sole, str):
            self.id = sole
            args = []
        # TODO: Currently hack to only utilize the 1st curie in a list if multiple curies provided
        elif isinstance(sole, list) and isinstance(sole[0], str):
            self.id = sole[0]
            args = []
    super().__init__(*args, **kwargs)

    # Another hack to keep things running.
    if isinstance(self.name, list):
        self.name = self.name[0]
    if self.id.startswith('http'):
        self.id = Text.obo_to_curie(self.id)

    #Synonyms is just for CURIEs
    self.synonyms = {LabeledID(identifier=self.id, label=self.name)}
    #List of labels to attach to exports
    self.export_labels = []
async def get_kegg_data(self, kegg_id):
    """Fetch the flat-file record for a KEGG compound and extract its data.

    The ``KEGG.COMPOUND`` prefix configuration supplies the base ``url`` and
    the ``keys`` handed to ``extract_kegg_data``.
    """
    conf = self.get_prefix_config('KEGG.COMPOUND')
    flat_file = await self.async_get_text(conf['url'] + Text.un_curie(kegg_id))
    record = self.parse_flat_file_to_dict(flat_file)
    return self.extract_kegg_data(record, conf['keys'])
async def get_pubchem_data(self, pubchem_id, retries = 0):
    """Get PubChem annotations, backing off according to the throttle header.

    The ``X-Throttling-Control`` response header is parsed into a mapping of
    status name to status text.  A ``Yellow`` or ``Red`` status pauses
    briefly before the data is returned.  A ``Black`` status sleeps
    ``3 * (retries + 1)`` seconds and repeats the request, giving up with an
    empty dict once ``retries`` has reached 3.

    Args:
        pubchem_id: identifier whose part after the last ``:`` is appended to
            the configured URL.
        retries: number of attempts already made; callers normally omit it.

    Returns the output of ``extract_pubchem_data``, or ``{}`` when the retry
    limit is exceeded.
    """
    conf = self.get_prefix_config('PUBCHEM')
    url = conf['url'] + pubchem_id.split(':')[-1]
    headers = {'Accept': 'application/json'}
    result = await self.async_get_raw_response(url, headers=headers)
    result_json = result['json']

    # pubmed api blocks if too many req are sent
    try:
        throttle = result['headers']['X-Throttling-Control']
    except KeyError:
        # No throttling information: treat the response as unthrottled
        # instead of failing the whole lookup.
        throttle = ''
    throttle_warnings = {
        Text.snakify(value.split(':')[0].lower()): value.split(':')[1]
        for value in throttle.split(',') if ':' in value
    }

    def reached(level):
        # A missing status entry counts as "not at this level".
        return any(
            level in throttle_warnings.get(status, '')
            for status in ('request_time_status', 'request_count_status')
        )

    if reached('Yellow'):
        logger.warning('Pubchem requests reached Yellow')
        await asyncio.sleep(0.5)
    if reached('Red'):
        logger.warning('Pubchem requests reached RED')
        await asyncio.sleep(2)
    if reached('Black'):
        sleep_sec = 3 * ( retries + 1 )
        await asyncio.sleep(sleep_sec)
        # repeat call if retries has changed till 3
        if retries < 3:
            return await self.get_pubchem_data(pubchem_id, retries + 1)
        # exceeding retries: always report it before giving up
        logger.warning(f'retry limit exceed for {pubchem_id} , returning empty')
        return {}
    return self.extract_pubchem_data(result_json, conf['keys'])
async def get_chemical_roles(self, chebi_id):
    """Get the labels of all roles assigned to a ChEBI id.

    Roles are the ``has_role`` (``RO_0000087``) targets of the id that are
    subclasses of ``CHEBI:50906`` in the ontology closure graph.  Each label
    is passed through ``Text.snakify``.

    Returns ``{chebi_id: rows}`` so that concurrent bulk callers can tell
    which id a result belongs to.
    """
    # Adjacent literals are concatenated into one single-line query string.
    sparql = (
        " PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>"
        " PREFIX has_role: <http://purl.obolibrary.org/obo/RO_0000087>"
        " PREFIX chemical_entity: <http://purl.obolibrary.org/obo/CHEBI_24431>"
        " PREFIX CHEBI: <http://purl.obolibrary.org/obo/CHEBI_>"
        " SELECT DISTINCT ?role_label"
        " from <http://reasoner.renci.org/ontology>"
        " from <http://reasoner.renci.org/redundant>"
        " where {"
        " $chebi_id has_role: ?role."
        " ?role rdfs:label ?role_label."
        " GRAPH <http://reasoner.renci.org/ontology/closure> {"
        " ?role rdfs:subClassOf CHEBI:50906."
        " }"
        " }"
        " "
    )
    rows = await self.tripleStore.async_query_template(
        inputs={'chebi_id': chebi_id},
        outputs=['role_label'],
        template_text=sparql,
    )
    for row in rows:
        row['role_label'] = Text.snakify(row['role_label'])
    return {chebi_id: rows}
def drug_get_gene(self, subject):
    """Get the gene targets Pharos lists for a drug.

    Each ``CHEMBL.COMPOUND`` synonym of ``subject`` is looked up at the
    Pharos ``ligands`` endpoint.  Every ``ix.idg.models.Target`` link whose
    Pharos target id maps to an HGNC id via ``self.target_to_hgnc`` yields a
    gene node and a ``PHAROS:drug_targets`` edge from the drug to the gene.
    Targets without an HGNC id are reported as warnings.

    Returns a list of ``(edge, gene_node)`` tuples.
    """
    resolved_edge_nodes = []
    for s in subject.get_synonyms_by_prefix('CHEMBL.COMPOUND'):
        pharosid = Text.un_curie(s)
        url = 'https://pharos.nih.gov/idg/api/v1/ligands(%s)?view=full' % pharosid
        r = requests.get(url)
        try:
            result = r.json()
        except ValueError:
            #Pharos returns a 404 if it doesn't recognize the identifier, which ends up producing
            # errors in turning into json. Skip to next identifier
            continue
        predicate = LabeledID(identifier='PHAROS:drug_targets', label='is_target')
        for link in result['links']:
            if link['kind'] != 'ix.idg.models.Target':
                continue
            pharos_target_id = int(link['refid'])
            hgnc = self.target_to_hgnc(pharos_target_id)
            if hgnc is not None:
                hgnc_node = KNode(hgnc, type=node_types.GENE)
                edge = self.create_edge(subject, hgnc_node, 'pharos.drug_get_gene',
                                        pharosid, predicate, url=url)
                resolved_edge_nodes.append((edge, hgnc_node))
            else:
                # Logger.warning is the supported spelling; .warn is deprecated.
                logging.getLogger('application').warning(
                    'Did not get HGNC for pharosID %d', pharos_target_id)
    return resolved_edge_nodes
def drug_to_gene_expanded(self, drug):
    """Collect chemical/gene edges for each MESH synonym of a drug node.

    Each synonym is queried against the
    ``CTD_chem_gene_expanded_chemicalID`` endpoint under ``self.url``.  Rows
    rejected by ``check_expanded_gene_chemical_row`` are skipped.  The row's
    ``direction`` decides which node is the edge source: ``'->'`` means the
    drug, anything else the gene.

    Returns a list of ``(edge, gene_node)`` tuples (not de-duplicated).
    """
    output = []
    for identifier in drug.get_synonyms_by_prefix('MESH'):
        url=f"{self.url}CTD_chem_gene_expanded_chemicalID/mesh:{Text.un_curie(identifier)}/"
        rows = requests.get(url).json()
        for row in rows:
            good_row, predicate_label, props, pmids = self.check_expanded_gene_chemical_row(row)
            if not good_row:
                continue
            predicate = self.normalize_predicate(
                LabeledID(identifier=f"CTD:{Text.snakify(predicate_label)}", label=predicate_label)
            )
            gene_node = KNode(Text.upper_curie(row['geneID']), name=row['gene_label'],type=node_types.GENE)
            if row['direction'] == '->':
                source, target = drug, gene_node
            else:
                source, target = gene_node, drug
            edge = self.create_edge(
                source, target, 'ctd.drug_to_gene_expanded', identifier, predicate,
                publications=pmids, properties=props, url=url,
            )
            output.append((edge, gene_node))
    return output
def map_concept_types(self, thing, object_type=None):
    """Expand high level concepts into concrete types our data sources understand.

    Resolution order:
      1. a type guessed from ``thing.identifier`` via ``self.guess_type``;
      2. otherwise the entry for ``thing.node_type`` in ``self.concepts``;
      3. otherwise the entry for ``object_type`` in ``self.concepts``,
         defaulting to ``[object_type]``.
    Types found in step 1 or 2 are mapped through ``self.vocab`` where an
    entry exists.  If ``thing`` has an identifier with a CURIE prefix, the
    result is replaced by ``[self.make_up_curie(prefix)]``.

    Returns a list of types.
    """
    # Try the CURIE approach.
    the_type = self.guess_type(
        thing.identifier) if thing and thing.identifier else None

    # If that didn't work, get candidate types based on the (abstract) node type.
    if thing and not the_type:
        the_type = self.concepts.get(thing.node_type, None)

    if the_type:
        # A lone string must be wrapped BEFORE the vocabulary mapping;
        # iterating it directly would map its individual characters.
        if isinstance(the_type, str):
            the_type = [the_type]
        # Attempt to map them down to IRIs
        the_type = [self.vocab.get(t, t) for t in the_type]

    # Systematize this:
    # If the concept type is disease but the curie is NAME, we don't have a DOID.
    result = the_type if the_type else self.concepts.get(
        object_type, [object_type])

    curie = Text.get_curie(thing.identifier) if thing else None
    if curie:
        result = [self.make_up_curie(curie)]
    return result
async def get_inchikey_data(self, inchikey_id):
    """Fetch the flat-file record for an InChIKey and extract its data.

    The ``INCHIKEY`` prefix configuration supplies the base ``url`` and the
    ``keys`` handed to ``extract_inchikey_data``.
    """
    conf = self.get_prefix_config('INCHIKEY')
    flat_file = await self.async_get_text(conf['url'] + Text.un_curie(inchikey_id))
    record = self.parse_flat_file_to_dict(flat_file)
    return self.extract_inchikey_data(record, conf['keys'])
def execute(self):
    """Execute the query that defines the graph.

    The user query supplies parallel sequences of cypher queries, start
    nodes, reversed flags and lookup nodes.  For each position a ``lookup``
    edge from the lookup node to the start node is recorded, the cypher is
    sent to ``self.rosetta.graph``, and the de-duplicated result edges are
    added with the matching reversed flag.
    """
    log = self.logger
    log.debug('Executing Query')
    #GreenT wants a cypherquery to find transitions, and a starting point
    user_query = self.userquery
    cyphers = user_query.generate_cypher()
    starts = user_query.get_start_node()
    reverses = user_query.get_reversed()
    lookups = user_query.get_lookups()

    for cypher, start, reverse, lookup in zip(cyphers, starts, reverses, lookups):
        input_name = Text.un_curie(lookup.identifier)
        log.debug(start)
        log.debug('CYPHER')
        log.debug(cypher)

        identifier, ntype = start
        start_node = KNode( identifier, ntype, label=input_name )
        lookup_edge = KEdge( 'lookup', 'lookup' )
        lookup_edge.source_node = lookup
        lookup_edge.target_node = start_node
        self.add_nonsynonymous_edge( lookup_edge )

        #Fire this to rosetta, collect the result
        found = self.rosetta.graph([(None, start_node)],query=cypher)
        #The result contains duplicate edges. Remove them, while preserving order:
        unique_edges = list(OrderedDict.fromkeys( found ) )
        self.add_edges( unique_edges , reverse )
    log.debug('Query Complete')
def go_term_to_cell_annotation_extensions(self, go_node):
    """Relate a GO term to cell types through annotation extensions.

    This is playing a little fast and loose with the annotations.
    Annotations relate a gene to a go term, and they can have an extension
    like occurs_in(celltype).  Technically, that occurs_in only relates to
    that particular gene/go combination.  But it's the only way to traverse
    from neurotransmitter release to neurons that is currently available.

    Returns a list of ``(edge, cell_node)`` tuples, one per distinct cell id
    whose qualifier maps to a predicate.
    """
    url = '{0}/QuickGO/services/annotation/search?includeFields=goName&goId=GO:{1}&taxonId=9606&extension=occurs_in(CL)'.format(
        self.url, Text.un_curie(go_node.id))
    annotations = self.page_calls(url)

    seen_cells = set()
    pairs = []
    for annotation in annotations:
        for extension in annotation['extensions']:
            for xref in extension['connectedXrefs']:
                if xref['db'] != 'CL' or xref['id'] in seen_cells:
                    continue
                predicate = self.get_predicate(xref['qualifier'])
                if predicate is None:
                    continue
                #Bummer, don't get a name
                cell_node = KNode('CL:{}'.format(xref['id']), type=node_types.CELL)
                edge = self.create_edge(
                    go_node,
                    cell_node,
                    'quickgo.go_term_to_cell_annotation_extensions',
                    go_node.id,
                    predicate,
                    url=url)
                pairs.append((edge, cell_node))
                seen_cells.add(xref['id'])
    return pairs
def chemical_get_enzyme(self,chemnode):
    """Find the enzymes acting on a chemical and how they act on it.

    The reactions for the chemical are looked up first; each reaction then
    gives (1) the enzymes and (2) whether the chemical is a reactant or a
    product.  A reactant yields an ``increases degradation of`` edge, a
    product an ``increases synthesis of`` edge; a reaction mentioning the
    chemical on neither side is logged as an error and skipped.

    Returns a list of ``(edge, enzyme_node)`` tuples.
    """
    reactions = self.chemical_get_reaction(chemnode)
    chemids = {Text.un_curie(x) for x in chemnode.get_synonyms_by_prefix('KEGG')}
    results = []
    for reaction_id in reactions:
        for rxn in self.get_reaction(reaction_id):
            if 'enzyme' not in rxn:
                continue
            for gene_id in rxn['enzyme']:
                enzyme = KNode(gene_id, type=node_types.GENE)
                consumed = chemids.intersection(rxn['reactants'])
                if len(consumed) > 0:
                    predicate = LabeledID('CTD:increases_degradation_of', label='increases degradation of')
                    #predicate = LabeledID('RO:0002449','negatively regulates, entity to entity')
                    input_identifier = consumed.pop()
                else:
                    produced = chemids.intersection(rxn['products'])
                    if len(produced) > 0:
                        predicate = LabeledID('CTD:increases_synthesis_of', label='increases synthesis of')
                        #predicate = LabeledID('RO:0002450','positively regulates, entity to entity')
                        input_identifier = produced.pop()
                    else:
                        logger.error(f"Mismatch between query and answer: {rxn} {chemids}")
                        continue
                edge = self.create_edge(enzyme, chemnode, 'kegg.chemical_get_enzyme', input_identifier, predicate)
                results.append((edge, enzyme))
    return results
def get_transitions(self, graph, query):
    """Execute a cypher query and turn each result row into a transition plan.

    Each row is read for its ``nodes`` and ``edges`` entries.  Every edge is
    matched, by id, to an edge in ``self.query_graph['edges']`` to obtain the
    requested type, and contributes one transition from its ``source`` node
    to its ``target`` node.

    Node ids are arbitrary indexes: they do not have to correspond to the
    order of calling or to any structural property of the graph.

    Returns:
        A list with one plan per row.  A plan maps
        ``source id -> target id -> list of transitions`` and has an (initially
        empty) list for every pair of node ids in the row.  Each transition is
        a dict with keys ``op``, ``link`` and ``predicate``, where
        ``predicate`` is the snakified query-edge type (a list when the type
        is a non-empty list, a string when it is a string, else ``None``).
    """
    with graph.driver.session() as session:
        # Read all rows while the session is open.
        rows = list(session.run(query))

    plans = []
    for row in rows:
        nodes = row['nodes']
        edges = row['edges']
        # extract transitions
        plan = {
            source: {target: [] for target in nodes}
            for source in nodes
        }
        for edge_id in edges:
            edge = edges[edge_id]
            qedge = next(candidate for candidate in self.query_graph['edges']
                         if candidate.id == edge_id)
            qedge_type = qedge.type
            if isinstance(qedge_type, list) and qedge_type:
                predicate = [Text.snakify(one_type) for one_type in qedge_type]
            elif isinstance(qedge_type, str):
                predicate = Text.snakify(qedge_type)
            else:
                predicate = None
            plan[edge['source']][edge['target']].append({
                "op": edge['op'],
                "link": edge['predicate'],
                "predicate": predicate
            })
        plans.append(plan)
    return plans