def test2():
    """Smoke check: print what OXO returns when mapping EFO:0000764 to DOID."""
    from service import ServiceContext

    oxo_service = OXO(ServiceContext.create_context())
    disease = KNode('EFO:0000764', node_types.DISEASE)
    mapped = oxo_service.efo_to_doid(disease)
    print(mapped)
def test_metabolite_to_enzyme(hmdb):
    """HMDB0011134 must map to at least one enzyme, including UniProtKB:Q96SL4."""
    metabolite = KNode('HMDB:HMDB0011134',
                       type=node_types.CHEMICAL_SUBSTANCE)
    edge_node_pairs = hmdb.metabolite_to_enzyme(metabolite)
    assert len(edge_node_pairs) > 0
    # Only the target node of each (edge, node) pair matters here.
    enzyme_ids = [target.id for _edge, target in edge_node_pairs]
    assert 'UniProtKB:Q96SL4' in enzyme_ids
def get(self, api_name, node, method_metadata):
    """Invoke a GET request on the specified API for `node` with the given metadata.

    The first synonym of `node` whose identifier contains one of
    `method_metadata.in_types` supplies the request argument. The JSON
    response is expanded with the service's JSON-LD context, and every
    `@id` found inside a list-valued property is converted to a curie and
    returned as a new node.

    Args:
        api_name: name of the API to call.
        node: input node; its `synonyms` are searched for a usable identifier.
        method_metadata: operation metadata providing `in_types`, `out_type`,
            `out_concept` and `op`.

    Returns:
        A list of (edge, node) tuples. Any failure (including "no usable
        synonym") is logged and the results gathered so far are returned,
        so callers never see an exception from this method.
    """
    result = []
    try:
        # Find a synonym on the input node of an appropriate input type
        # for this operation; the first match wins.
        input_arg = None
        input_type = None
        for synonym in node.synonyms:
            syn = self.identifiers.curie_instance2id(synonym)
            logger.debug(f"syn -> {syn}")
            logger.debug(f"in_types -> {method_metadata.in_types}")
            for t in method_metadata.in_types:
                if t in syn:
                    # Split on the first colon only so that a local id which
                    # itself contains ':' is passed through intact.
                    input_arg = synonym.split(':', 1)[1]
                    input_type = t
                    break
            if input_arg:
                break

        # Fail if no supplied synonym is of an appropriate type to make the call.
        if not input_arg:
            raise ValueError(
                f"Node {node} contains no synonyms of type {method_metadata.in_types} required by operation {method_metadata.op}"
            )

        # Get the service metadata.
        service_metadata = self.get_service_metadata(
            api_name, input_type, method_metadata.out_type)
        logger.debug(
            "* Executing translator registry method: {0} in: {1} out: {2} template: {3} value: {4} "
            .format(api_name, input_type, method_metadata.out_type,
                    service_metadata.get_url, node))

        # Parameterize and execute the HTTP request.
        url = Template(service_metadata.get_url).render(input=input_arg)
        response = requests.get(url).json()

        # Expand the context with JSON-LD. The dumps/loads round trip yields
        # a private copy of the context with floats turned into strings.
        jsonld_context = json.loads(json.dumps(service_metadata.jsonld),
                                    parse_float=lambda v: str(v))
        # '@version' is dropped before expansion (presumably the JSON-LD
        # processor rejects it - confirm); tolerate contexts without it.
        jsonld_context['@context'].pop('@version', None)
        expanded = jsonld.expand(
            response, {"expandContext": jsonld_context['@context']})

        # Extract data from the returned JSON object.
        # TODO: Responses are complex. Figure out how to generalize
        #   * Traversal of the response
        #   * Decisions about how to create nodes and edges
        #   * What to say about the semantic types of returned identifiers
        logger.debug(json.dumps(expanded, indent=2))
        for obj in expanded:
            for predicate, v in obj.items():
                if not isinstance(v, list):
                    continue
                for item in v:
                    val = item["@id"] if "@id" in item else None
                    if not val:
                        continue
                    curie = self.identifiers.instance2curie(val)
                    out_concept = method_metadata.out_concept
                    node_type = get_node_type(out_concept)
                    if curie and node_type:
                        new_node = KNode(curie, type=node_type)
                        result.append(
                            (self.new_edge(source=self.name,
                                           function=predicate,
                                           properties=response,
                                           source_node=node,
                                           target_node=new_node), new_node))
    except Exception:
        # Best effort: record the full traceback as text and fall through
        # to return whatever was collected.
        logger.error(traceback.format_exc())
    return result
def x_test_diabetes_to_metabolite(rosetta, hmdb):
    """Synonymize MONDO:0005148 and expect HMDB to return metabolites for it.

    NOTE(review): the ``x_`` prefix presumably keeps pytest from collecting
    this test - confirm whether it should be re-enabled.
    """
    disease = KNode('MONDO:0005148', type=node_types.DISEASE)
    # Synonyms are added to the node in place before it is queried.
    rosetta.synonymizer.synonymize(disease)
    print(disease.synonyms)
    metabolite_pairs = hmdb.disease_to_metabolite(disease)
    assert len(metabolite_pairs) > 0
def test_enzyme_to_metabolite(hmdb):
    """UniProtKB:Q96SL4 must map to metabolites, one of them named '5-HETE'."""
    enzyme = KNode('UniProtKB:Q96SL4', type=node_types.GENE)
    edge_node_pairs = hmdb.enzyme_to_metabolite(enzyme)
    assert len(edge_node_pairs) > 0
    metabolite_names = [target.name for _edge, target in edge_node_pairs]
    assert '5-HETE' in metabolite_names
def name_to_efo(self, name):
    """Look up `name` and return an (edge, disease node) pair per distinct EFO alias.

    Every concept returned by `self.request_concept(name)` is scanned for
    aliases starting with "EFO:"; each alias is emitted only the first time
    it is encountered.
    """
    pairs = []
    emitted = set()
    for concept in self.request_concept(name):
        for alias in concept['aliases']:
            if not alias.startswith("EFO:") or alias in emitted:
                continue
            logger.debug (" -- appending a {}".format (alias))
            edge = self.get_edge(concept, predicate='name_to_efo')
            pairs.append((edge, KNode(alias, node_types.DISEASE)))
            emitted.add(alias)
    return pairs
def name_to_mesh(self, name):
    """Look up `name` and return an (edge, node) pair per distinct MESH alias.

    New nodes take their type from `name.node_type`. The collected pairs
    are passed through `set` before being returned, so the order of the
    returned list is not defined.
    """
    pairs = []
    emitted = set()
    for concept in self.request_concept(name):
        for alias in concept['aliases']:
            if not alias.startswith("MESH:") or alias in emitted:
                continue
            # TODO: not sure what node type should be here...
            edge = self.get_edge(concept, predicate='name_to_mesh')
            pairs.append((edge, KNode(alias, name.node_type)))
            emitted.add(alias)
    return list(set(pairs))
def test():
    """Smoke check: print the UniProtKB mapping HGNC returns for NCBIGENE:3815."""
    from greent.service import ServiceContext

    hgnc_service = HGNC(ServiceContext.create_context())
    gene = KNode('NCBIGENE:3815', node_type=node_types.GENE)
    mapped = hgnc_service.ncbigene_to_uniprotkb(gene)
    print(mapped)
def test_imatinib_asthma(omnicorpus):
    """CHEBI:45783 and MONDO:0004979 must share PMIDs, including PMID:15374841."""
    drug = KNode('CHEBI:45783', type=node_types.DRUG)
    disease = KNode('MONDO:0004979', type=node_types.DISEASE)
    shared = omnicorpus.get_shared_pmids(drug, disease)
    assert len(shared) > 0
    assert 'PMID:15374841' in shared
def test_non_HP_pheno_to_anatomy(uberon):
    """A phenotype whose id does not use the HP prefix must yield no anatomy."""
    # Same numeric id as an HP term, but with a made-up 'xx' prefix.
    phenotype = KNode('xx:0011675', type=node_types.PHENOTYPIC_FEATURE)
    anatomy_pairs = uberon.get_anatomy_by_phenotype_graph(phenotype)
    assert len(anatomy_pairs) == 0
def test_disease_to_anatomy_rc_face(uberon):
    """MONDO:0022407 must be linked to the anatomy node UBERON:0001456."""
    disease = KNode('MONDO:0022407', type=node_types.DISEASE)
    anatomy_pairs = uberon.get_anatomy_by_disease(disease)
    anatomy_ids = [target.id for _edge, target in anatomy_pairs]
    print(anatomy_ids)
    assert 'UBERON:0001456' in anatomy_ids
def test_pheno_to_anatomy_7354(uberon):
    """HP:0007354 must resolve to at least one anatomy result."""
    phenotype = KNode('HP:0007354', type=node_types.PHENOTYPIC_FEATURE)
    anatomy_pairs = uberon.get_anatomy_by_phenotype_graph(phenotype)
    assert len(anatomy_pairs) > 0
def execute_knowledge_graph_program(self, inputs, program):
    """Construct a knowledge graph from a set of input nodes and a program.

    A program is a list of frames, each of which contains the name of a
    concept, a collector containing a list of edges and nodes where all
    target nodes are instances of the frame's concept, and a list of
    operations for transitioning from one frame's concept space to the
    next frame's. This method assumes a linear path.

    Args:
        inputs: mapping of concept name to a list of identifier values.
        program: list of frames to execute, in order.

    Returns:
        A list of the edges produced by every operator invocation, or an
        empty list when `program` is empty. Failures of individual operator
        invocations are logged and skipped.
    """
    # Convert inputs to be structured like the edges-and-nodes returned by
    # a previous service.
    next_nodes = {
        key: [(None, KNode(val, type=key)) for val in val_list]
        for key, val_list in inputs.items()
    }
    logger.debug(f"inputs: {next_nodes}")

    # Validate the input program.
    if len(program) == 0:
        logger.info(f"No program found for {inputs}")
        return []
    logger.info(f"program> {program}")
    result = []

    # Each frame's name is a concept. We use the top frame's as a key to
    # index the arguments.
    top_frame = program[0]
    inputs = next_nodes[top_frame.name]
    for i in inputs:
        self.synonymizer.synonymize(i[1])

    # Stack is the overall executable. We prepend a base frame with a
    # collector primed with input arguments.
    stack = [Frame(collector=inputs)] + program

    # Execute the program frame by frame.
    for index, frame in enumerate(program):
        for k, o in frame.ops.items():
            logger.debug(f"-- frame-index--> {frame} {index} {k}=>{o.op}")

        # Process each node in the collector.
        # NOTE(review): `index` is reset to 0 here, so every frame reads its
        # source nodes from stack[0] (the input frame) and validates ids
        # against program[1]. This looks like a leftover; it is preserved
        # because changing it alters which nodes each frame processes.
        index = 0
        for edge, source_node in stack[index].collector:
            # Process each operator in the frame.
            for op_name, operator in frame.ops.items():
                # Generate a cache key.
                key = f"{operator.op}({source_node.id})"
                try:
                    logger.debug(f" --op: {key}")

                    # Load the object from cache.
                    response = self.cache.get(key)
                    if not response:
                        # Invoke the knowledge source with the given input.
                        op = self.get_ops(operator.op)
                        if not op:
                            raise Exception(
                                f"Unable to find op: {operator.op}")
                        response = op(source_node)
                        for new_edge, new_node in response:
                            # Process the edge adding metadata.
                            if isinstance(new_edge, KEdge):
                                new_edge.predicate = operator.predicate
                                new_edge.source_node = source_node
                                self.synonymizer.synonymize(new_node)
                                new_edge.target_node = new_node

                            # Validate the id space of the returned data
                            # maps to the target concept.
                            if index < len(program) - 1:
                                target_concept_name = program[index + 1].name
                                prefixes = self.type_graph.concept_model.get(
                                    target_concept_name).id_prefixes
                                valid = any(
                                    new_node.id.upper().startswith(p.upper())
                                    for p in prefixes)
                                if not valid:
                                    logger.debug(
                                        f"Operator {operator} wired to type: {target_concept_name} returned node with id: {new_node.id}"
                                    )

                        # Cache the annotated and validated response.
                        self.cache.set(key, response)

                    # Add processed edges to the overall result.
                    result += [pair_edge for pair_edge, _ in response]
                    logger.debug(f"{key} => {Text.short(response)}")

                    # Response edges go in the collector to become input
                    # for the next operation.
                    frame.collector += response
                except Exception:
                    # One failing operator must not abort the whole program.
                    traceback.print_exc()
                    logger.warning(f"Error invoking> {key}")
    logger.debug(f"returning {len(result)} values.")
    return result
def compile_results(self, fname, ntype, searchResults):
    """Wrap each search hit as an (edge, node) synonym pair.

    Args:
        fname: function name recorded on every 'oxo' edge.
        ntype: node type given to every created node.
        searchResults: iterable of mappings, each with a 'curie' entry.

    Returns:
        A list with one (KEdge, KNode) tuple per search hit.
    """
    return [
        (KEdge('oxo', fname, is_synonym=True),
         KNode(identifier=hit['curie'], node_type=ntype))
        for hit in searchResults
    ]
def test_huntington_is_genetic(mondo2):
    """OMIM:143100 must be reported as a genetic disease."""
    disease = KNode('OMIM:143100', type=node_types.DISEASE)
    is_genetic = mondo2.is_genetic_disease(disease)
    assert is_genetic
def test_two_disease(omnicorpus):
    """MONDO:0005090 and MONDO:0003425 must share at least one PMID."""
    first = KNode('MONDO:0005090', type=node_types.DISEASE)
    second = KNode('MONDO:0003425', type=node_types.DISEASE)
    shared = omnicorpus.get_shared_pmids(first, second)
    assert len(shared) > 0
def name_to_doid(self, name):
    """Look up a disease `name` and return one pair per distinct DOID alias.

    Concepts are requested with the DISEASE node type; every alias starting
    with "DOID:" is emitted the first time it is encountered.

    Returns:
        A list of (edge, KNode) tuples, in the order the aliases were found.
    """
    result = []
    seen = set()
    for r in self.request_concept(name, node_types.DISEASE):
        for a in r['aliases']:
            if a.startswith("DOID:") and a not in seen:
                # Lazy %-style args: the message is only built when emitted.
                logger.debug(" -- appending a %s", a)
                result.append(
                    (self.get_edge(r, predicate='name_to_doid'),
                     KNode(a, node_types.DISEASE)))
                seen.add(a)
    logger.info("Returning %s doids", len(result))
    return result
def test_list_returns_zero(omnicorpus):
    """UBERON:0013694 and GO:0045892 are expected to share no PMIDs."""
    anatomy = KNode('UBERON:0013694', type=node_types.ANATOMY)
    process = KNode('GO:0045892', type=node_types.PROCESS)
    shared = omnicorpus.get_shared_pmids(anatomy, process)
    assert len(shared) == 0
def name_to_drugbank(self, name):
    """Look up `name` and return an (edge, drug node) pair per distinct DRUGBANK alias.

    The collected pairs are passed through `set` before being returned, so
    the order of the returned list is not defined.
    """
    concepts = self.request_concept(name)
    pairs = []
    emitted = set()
    for concept in concepts:
        for alias in concept['aliases']:
            if not alias.startswith("DRUGBANK:") or alias in emitted:
                continue
            edge = self.get_edge(concept, predicate='name_to_drugbank')
            pairs.append((edge, KNode(alias, node_types.DRUG)))
            emitted.add(alias)
    return list(set(pairs))
def test_pmid_count(omnicorpus):
    """The PMID count for UBERON:0013694 must equal the hand-verified value."""
    anatomy = KNode('UBERON:0013694', type=node_types.ANATOMY)
    pmid_count = omnicorpus.count_pmids(anatomy)
    # Expected value was checked by hand in the database.
    assert pmid_count == 2058
def test_metabolite_to_enzyme_with_syn(rosetta, hmdb):
    """After synonymization, CHEBI:27732 must map to at least one enzyme."""
    chemical = KNode('CHEBI:27732', type=node_types.CHEMICAL_SUBSTANCE)
    # Synonyms are added to the node in place before it is queried.
    rosetta.synonymizer.synonymize(chemical)
    print(chemical.synonyms)
    enzyme_pairs = hmdb.metabolite_to_enzyme(chemical)
    assert len(enzyme_pairs) > 0
def sequence_variant_to_gene(self, variant_node):
    """Return (edge, gene node) pairs for genes near an HG38-positioned variant.

    The variant's position is read from its 'ROBO_VARIANT' synonyms, whose
    un-curied form is '|'-separated: reference genome, chromosome, start,
    end. Only keys with reference genome 'HG38' are used; if several match,
    the last one wins. Genes are selected from the genes database using a
    window of 500000 on each side of the variant, and each edge carries a
    'distance' property (0 when the variant does not lie wholly outside
    the gene).

    Returns:
        A list of (edge, KNode) tuples; empty when no usable key exists.
    """
    flank = 500000
    gene_hits = []

    robokop_ids = variant_node.get_synonyms_by_prefix('ROBO_VARIANT')
    if not robokop_ids:
        logger.debug(
            f'ensembl: robokop variant key not found for variant: {variant_node.id}'
        )
        return gene_hits

    has_hg38_key = False
    try:
        for robokop_key in robokop_ids:
            fields = Text.un_curie(robokop_key).split('|')
            if fields[0] != 'HG38':
                continue
            has_hg38_key = True
            chromosome = fields[1]
            start_position = int(fields[2])
            end_position = int(fields[3])
    except IndexError:
        # A key with too few '|'-separated fields.
        logger.debug(
            f'ensembl: robokop variant key not set properly for variant: {variant_node.id} - {robokop_ids[0]}'
        )
        return gene_hits

    if not has_hg38_key:
        logger.debug(
            f'ensembl: latest robokop variant key not found for variant: {variant_node.id}'
        )
        return gene_hits

    # Search window around the variant, clamped at position 0.
    window_start = start_position - flank
    window_start = 0 if window_start < 0 else window_start
    window_end = end_position + flank

    connection = self.create_or_connect_to_genes_db()
    cursor = connection.cursor()
    query_args = (chromosome, window_start, window_start, window_end,
                  window_end, window_start, window_end)
    cursor.execute(self.gene_range_select_sql, query_args)

    for gene_id_text, gene_start, gene_end in cursor.fetchall():
        # cast this to make neo4j happy
        gene_id = str(gene_id_text)
        gene_node = KNode(f'ENSEMBL:{gene_id}',
                          name=f'{gene_id}',
                          type=node_types.GENE)
        distance = (gene_start - start_position
                    if start_position < gene_start else
                    end_position - gene_end
                    if end_position > gene_end else 0)
        edge = self.create_edge(variant_node,
                                gene_node,
                                'ensembl.sequence_variant_to_gene',
                                variant_node.id,
                                self.var_to_gene_predicate,
                                url=self.gene_batch_url,
                                properties={'distance': distance})
        gene_hits.append((edge, gene_node))

    logger.info(
        f'ensembl sequence_variant_to_gene found {len(gene_hits)} results for {variant_node.id}'
    )
    return gene_hits
def test_disease_to_metabolite(hmdb):
    """UMLS:C0004096 must map to metabolites, one of them named '5-HETE'."""
    disease = KNode('UMLS:C0004096', type=node_types.DISEASE)
    edge_node_pairs = hmdb.disease_to_metabolite(disease)
    assert len(edge_node_pairs) > 0
    metabolite_names = [target.name for _edge, target in edge_node_pairs]
    assert '5-HETE' in metabolite_names
def sequence_variant_to_sequence_variant(self, variant_node):
    """Return variants in linkage disequilibrium with `variant_node`.

    For every 'DBSNP' synonym of the input node, the LD endpoint
    ('/ld/human/<id>/<population>?r2=0.8', population
    '1000GENOMES:phase_3:MXL') under `self.url` is queried. Each variant
    parsed from a 200 response becomes a new DBSNP sequence-variant node
    linked to the input by an edge carrying the 'r2' value. Non-200
    responses are logged and skipped.

    Returns:
        A list of (edge, KNode) tuples.
    """
    ld_url = '/ld/human/'
    options_url = '?r2=0.8'
    population = '1000GENOMES:phase_3:MXL'
    linked_variants = []

    for dbsnp_curie in variant_node.get_synonyms_by_prefix('DBSNP'):
        variant_id = Text.un_curie(dbsnp_curie)
        query_url = f'{self.url}{ld_url}{variant_id}/{population}{options_url}'
        query_response = requests.get(
            query_url, headers={"Content-Type": "application/json"})

        if query_response.status_code != 200:
            # TODO: handle HTTP 429 (rate limiting) by waiting and retrying.
            logger.error(
                f'Ensembl returned a non-200 response for {variant_node.id}: {query_response.status_code})'
            )
            continue

        parsed = self.parse_ld_variants_from_ensembl(query_response.json())
        for variant_info in parsed:
            new_variant_id = variant_info[0]
            props = {'r2': variant_info[1]}
            new_variant_node = KNode(f'DBSNP:{new_variant_id}',
                                     type=node_types.SEQUENCE_VARIANT)
            new_variant_node.add_export_labels([node_types.SEQUENCE_VARIANT])
            edge = self.create_edge(
                variant_node,
                new_variant_node,
                'ensembl.sequence_variant_to_sequence_variant',
                dbsnp_curie,
                self.var_to_var_predicate,
                url=query_url,
                properties=props)
            linked_variants.append((edge, new_variant_node))

    return linked_variants
def test_metabolite_to_disease(hmdb):
    """HMDB0011134 must map to diseases, one of them named 'Asthma'."""
    metabolite = KNode('HMDB:HMDB0011134',
                       type=node_types.CHEMICAL_SUBSTANCE)
    edge_node_pairs = hmdb.metabolite_to_disease(metabolite)
    assert len(edge_node_pairs) > 0
    disease_names = [target.name for _edge, target in edge_node_pairs]
    assert 'Asthma' in disease_names
def process_associations(self,
                         associations,
                         relationship_id,
                         function,
                         target_node_type,
                         input_identifier,
                         url,
                         input_node,
                         reverse=False):
    """Given a response from biolink, create our edge and node structures.

    Sometimes (as in pathway->Genes) biolink returns the query as the
    object, rather than the subject. reverse=True will handle this case,
    bringing back the subject of the response, rather than the object.
    Fortunately, it looks like this is just per-function. We could instead
    try to see if the subject id matched our input id, etc... if the same
    function sometimes spun things around.

    Args:
        associations: iterable of association mappings from biolink.
        relationship_id: fallback predicate id used when the association
            carries none, or carries a label that cannot be resolved.
        function: name appended to 'biolink.' to label the edge's origin.
        target_node_type: node type for nodes built from association objects.
        input_identifier: identifier recorded on each created edge.
        url: query url recorded on each created edge.
        input_node: the node that was queried for.
        reverse: treat the association's object as the edge source.

    Returns:
        A list of (edge, new node) tuples.

    Raises:
        Whatever `self.create_edge` raises, after logging the publications.
    """
    edge_nodes = []
    for association in associations:
        # Biolink also returns associations whose subject is a subclass of
        # the queried entity; keep only the ones stated directly on the
        # entity we asked about, so subclass relations are not pushed up
        # onto parent classes.
        if association['subject']['id'] != input_node.id:
            continue

        pubs = []
        for pub in association.get('publications') or []:
            # Sometimes we get back something like "uniprotkb" instead of
            # a PMID. We don't want it.
            if pub['id'][:4].upper() != 'PMID':
                continue
            # Ids can arrive glued together, e.g. 'PMID:9557891PMID:9557891'
            # or 'PMID:12687501:PMID:17918734'; split them apart, strip the
            # trailing separators and skip fragments left empty.
            for n in pub['id'].split('PMID:')[1:]:
                n = n.rstrip(':')
                if n:
                    pubs.append(f'PMID:{n}')

        # A missing or null relation is treated like an empty one so the
        # fallback predicate below applies instead of a KeyError.
        relation = association.get('relation') or {}
        inverse = relation.get('inverse', False)

        newnode = KNode(association['object']['id'],
                        type=target_node_type,
                        name=association['object']['label'])
        if reverse or inverse:
            source_node, target_node = newnode, input_node
        else:
            source_node, target_node = input_node, newnode

        # Deal with biolink's occasional propensity to return Null relations.
        # This basically happens only with the gene_get_function call, so if
        # that gets fixed, we might be able to make this a little nicer.
        predicate_id = relation.get('id')
        if predicate_id is None:
            predicate_id = relationship_id
        elif ':' not in predicate_id:
            # Not a curie: try to resolve it as a label.
            if predicate_id in self.label2id:
                predicate_id = self.label2id[predicate_id]
            else:
                logging.getLogger('application').error(
                    f'Relationship Missing: {predicate_id}')
                predicate_id = relationship_id
        predicate = LabeledID(identifier=predicate_id,
                              label=relation.get('label'))

        try:
            edge = self.create_edge(source_node,
                                    target_node,
                                    f'biolink.{function}',
                                    input_identifier,
                                    predicate,
                                    publications=pubs,
                                    url=url)
        except Exception:
            # Use .get() so a missing 'publications' key cannot replace the
            # real error with a KeyError; bare raise keeps the traceback.
            logging.getLogger('application').error(
                'Edge creation failed; raw publications: %s parsed: %s',
                association.get('publications'), pubs)
            raise
        edge_nodes.append((edge, newnode))
    return edge_nodes
def test_metabolite_to_pathway(hmdb):
    """HMDB0011134 must map to pathways, including SMPDB:SMP00710."""
    metabolite = KNode('HMDB:HMDB0011134',
                       type=node_types.CHEMICAL_SUBSTANCE)
    edge_node_pairs = hmdb.metabolite_to_pathway(metabolite)
    assert len(edge_node_pairs) > 0
    pathway_ids = [target.id for _edge, target in edge_node_pairs]
    assert 'SMPDB:SMP00710' in pathway_ids
def test_is_genetic_diabetes_genetic(mondo2):
    """MONDO:0015967 ('rare genetic disease') must be reported as genetic."""
    disease = KNode('MONDO:0015967',
                    name='rare genetic disease',
                    type=node_types.DISEASE)
    is_genetic = mondo2.is_genetic_disease(disease)
    assert is_genetic
def graph_uniprot_to_hgnc(self, uniprot_symbol):
    """Return (synonym edge, HGNC gene node) pairs for a UniProt symbol.

    Each record from `self.uniprot_to_hgnc` supplies an 'hgncID'; only the
    part after its last ':' is kept and re-prefixed with 'HGNC:'.
    """
    pairs = []
    for record in self.uniprot_to_hgnc(uniprot_symbol):
        edge = self.get_edge(record, predicate='synonym')
        local_id = record['hgncID'].split(':')[-1]
        gene = KNode('HGNC:{0}'.format(local_id), node_types.GENE)
        pairs.append((edge, gene))
    return pairs
def test():
    """Smoke check: print whether Mondo reports OMIM:143100 as a genetic disease."""
    mondo = Mondo(ServiceContext.create_context())
    disease = KNode('OMIM:143100', node_types.DISEASE)
    is_genetic = mondo.is_genetic_disease(disease)
    print(is_genetic)
    print('------')