def _load_answers_into_kg(self, neo4j_results: List[Dict[str, List[Dict[str, any]]]], kg_name: str,
                          qg: QueryGraph, log: ARAXResponse) -> QGOrganizedKnowledgeGraph:
    """
    Load the first table of neo4j results into a knowledge graph organized by QG keys.

    Each column of the table is dispatched on its name: columns starting with 'nodes'
    (e.g., 'nodes_n00') hold answer nodes and columns starting with 'edges' (e.g., 'edges_e01')
    hold answer edges. The column name with its first 'nodes_'/'edges_' occurrence removed is
    used as the qnode/qedge key. Columns matching neither prefix are skipped.

    :param neo4j_results: Results from neo4j; only the first item (the results table) is read.
    :param kg_name: Name of the KG that was queried; forwarded to the converters. A node
        UUID-to-curie lookup is built only when this is "KG1".
    :param qg: The query graph; its first qedge key is reported in the debug log message.
    :param log: The response object used for logging.
    :return: The answer knowledge graph, organized by qnode/qedge keys.
    """
    log.debug(f"Processing query results for edge {next(qedge_key for qedge_key in qg.edges)}")
    final_kg = QGOrganizedKnowledgeGraph()

    # Only KG1 edges need their endpoint UUIDs translated into curies
    if kg_name == "KG1":
        uuid_to_curie = self._build_node_uuid_to_curie_dict(neo4j_results[0])
    else:
        uuid_to_curie = dict()
    table = neo4j_results[0]

    for column in list(table):
        if column.startswith('nodes'):
            # Node column, e.g. 'nodes_n00' -> qnode key 'n00'
            qnode_key = column.replace("nodes_", "", 1)
            for raw_node in table.get(column):
                trapi_node_key, trapi_node = self._convert_neo4j_node_to_trapi_node(raw_node, kg_name)
                final_kg.add_node(trapi_node_key, trapi_node, qnode_key)
            continue

        if column.startswith('edges'):
            # Edge column, e.g. 'edges_e01' -> qedge key 'e01'
            qedge_key = column.replace("edges_", "", 1)
            for raw_edge in table.get(column):
                trapi_edge_key, trapi_edge = self._convert_neo4j_edge_to_trapi_edge(raw_edge, uuid_to_curie,
                                                                                   kg_name)
                final_kg.add_edge(trapi_edge_key, trapi_edge, qedge_key)

    return final_kg
def _add_answers_to_kg(self, answer_kg: QGOrganizedKnowledgeGraph, reasoner_std_response: Dict[str, any],
                       input_qnode_key: str, output_qnode_key: str, qedge_key: str,
                       log: ARAXResponse) -> QGOrganizedKnowledgeGraph:
    """
    Copy the nodes and edges of a BTE response into the given answer knowledge graph.

    Nothing is added when the response's knowledge graph has no edges. BTE's own query-graph
    IDs are translated back to ours: "n0" -> input_qnode_key, "n1" -> output_qnode_key and
    "e1" -> qedge_key. Any other ID is logged as an "UnknownQGID" error and processing stops,
    returning whatever had been added so far.

    :param answer_kg: Knowledge graph to add to (mutated in place and also returned).
    :param reasoner_std_response: BTE response; its 'results' and 'knowledge_graph' entries are read.
    :param input_qnode_key: Our qnode key corresponding to BTE's "n0".
    :param output_qnode_key: Our qnode key corresponding to BTE's "n1".
    :param qedge_key: Our qedge key corresponding to BTE's "e1".
    :param log: The response object used for logging.
    :return: The same answer_kg object that was passed in.
    """
    kg_to_qg_ids_dict = self._build_kg_to_qg_id_dict(reasoner_std_response['results'])
    bte_kg = reasoner_std_response['knowledge_graph']
    if not bte_kg['edges']:
        return answer_kg

    preferred_key_by_bte_key = dict()
    log.debug(f"Got results back from BTE for this query "
              f"({len(reasoner_std_response['knowledge_graph']['edges'])} edges)")

    for bte_node in bte_kg['nodes']:
        new_node = Node()
        bte_node_key = bte_node.get('id')
        new_node.name = bte_node.get('name')
        new_node.category = eu.convert_to_list(eu.convert_string_to_snake_case(bte_node.get('type')))

        # Map the returned BTE qg_id back to the original qnode_key in our query graph
        node_qg_id = kg_to_qg_ids_dict['nodes'].get(bte_node_key)
        if node_qg_id == "n0":
            qnode_key = input_qnode_key
        elif node_qg_id == "n1":
            qnode_key = output_qnode_key
        else:
            log.error("Could not map BTE qg_id to ARAX qnode_key", error_code="UnknownQGID")
            return answer_kg

        # Output nodes are re-keyed by their preferred equivalent identifier (cached per BTE key)
        if qnode_key != output_qnode_key:
            new_node_key = bte_node_key
        elif bte_node_key in preferred_key_by_bte_key:
            new_node_key = preferred_key_by_bte_key.get(bte_node_key)
        else:
            equivalent_curies = []
            for prefix, local_ids in bte_node.get('equivalent_identifiers').items():
                for local_id in local_ids:
                    equivalent_curies.append(f"{prefix}:{eu.get_curie_local_id(local_id)}")
            new_node_key = self._get_best_equivalent_bte_curie(equivalent_curies, new_node.category[0])
            preferred_key_by_bte_key[bte_node_key] = new_node_key

        answer_kg.add_node(new_node_key, new_node, qnode_key)

    for bte_edge in bte_kg['edges']:
        new_edge = Edge()
        new_edge_key = bte_edge.get("id")
        new_edge.predicate = bte_edge.get('type')
        # Point the edge at the re-keyed nodes where applicable; otherwise keep BTE's own node IDs
        new_edge.subject = preferred_key_by_bte_key.get(bte_edge.get('source_id'), bte_edge.get('source_id'))
        new_edge.object = preferred_key_by_bte_key.get(bte_edge.get('target_id'), bte_edge.get('target_id'))
        provided_by_attribute = Attribute(name="provided_by",
                                          value=bte_edge.get('edge_source'),
                                          type=eu.get_attribute_type("provided_by"))
        defined_by_attribute = Attribute(name="is_defined_by",
                                         value="BTE",
                                         type=eu.get_attribute_type("is_defined_by"))
        new_edge.attributes = [provided_by_attribute, defined_by_attribute]

        # Map the returned BTE qg_id back to the original qedge_key in our query graph
        edge_qg_id = kg_to_qg_ids_dict['edges'].get(new_edge_key)
        if edge_qg_id != "e1":
            log.error("Could not map BTE qg_id to ARAX qedge_key", error_code="UnknownQGID")
            return answer_kg
        answer_kg.add_edge(new_edge_key, new_edge, qedge_key)

    return answer_kg
def _load_answers_into_kg(
        self, neo4j_results: List[Dict[str, List[Dict[str, any]]]],
        kg_name: str, qg: QueryGraph, log: ARAXResponse
) -> Tuple[QGOrganizedKnowledgeGraph, Dict[str, Dict[str, str]]]:
    """
    Load the first table of neo4j results into a QG-organized knowledge graph.

    Columns starting with 'nodes' (e.g., 'nodes_n00') are loaded as nodes and columns starting
    with 'edges' (e.g., 'edges_e01') as edges; the column name with its first 'nodes_'/'edges_'
    occurrence removed is the qnode/qedge key. Other columns are skipped. For every edge, the
    value stored on the neo4j edge under each qnode key of the query graph is also recorded.

    :param neo4j_results: Results from neo4j; only the first item (the results table) is read.
    :param kg_name: Name of the KG that was queried; forwarded to the converters. A node
        UUID-to-curie lookup is built only when this is "KG1".
    :param qg: The query graph; supplies the qnode keys to record per edge, and its first qedge
        key is reported in the debug log message.
    :param log: The response object used for logging.
    :return: A tuple of (answer knowledge graph, map of edge key -> {qnode key -> value found on
        the neo4j edge under that qnode key}).
    """
    log.debug(f"Processing query results for edge {next(qedge_key for qedge_key in qg.edges)}")
    final_kg = QGOrganizedKnowledgeGraph()
    edge_to_nodes_map = dict()

    # Only KG1 edges need their endpoint UUIDs translated into curies
    if kg_name == "KG1":
        uuid_to_curie = self._build_node_uuid_to_curie_dict(neo4j_results[0])
    else:
        uuid_to_curie = dict()
    table = neo4j_results[0]

    for column in list(table):
        if column.startswith('nodes'):
            # Node column, e.g. 'nodes_n00' -> qnode key 'n00'
            column_qnode_key = column.replace("nodes_", "", 1)
            for raw_node in table.get(column):
                node_key, node = self._convert_neo4j_node_to_swagger_node(raw_node, kg_name)
                final_kg.add_node(node_key, node, column_qnode_key)
            continue

        if column.startswith('edges'):
            # Edge column, e.g. 'edges_e01' -> qedge key 'e01'
            column_qedge_key = column.replace("edges_", "", 1)
            for raw_edge in table.get(column):
                edge_key, edge = self._convert_neo4j_edge_to_swagger_edge(raw_edge, uuid_to_curie, kg_name)

                # Record which of this edge's nodes correspond to which qnode_key
                nodes_for_edge = edge_to_nodes_map.setdefault(edge_key, dict())
                for qnode_key in qg.nodes:
                    nodes_for_edge[qnode_key] = raw_edge.get(qnode_key)

                final_kg.add_edge(edge_key, edge, column_qedge_key)

    return final_kg, edge_to_nodes_map
def answer_single_node_query(self, single_node_qg: QueryGraph) -> QGOrganizedKnowledgeGraph:
    """
    Answer a query graph consisting of a single node by looking that node up in neo4j.

    The KG name, synonym setting and log object are read from this instance. When the qnode has
    an ID, it is first replaced by its synonyms (KG1 with synonyms enabled) or by its canonical
    curies (KG2c), and in either case the qnode's category is cleared.

    :param single_node_qg: The query graph; only its first qnode is used.
    :return: A knowledge graph (organized by QG keys) containing the matching node(s).
    """
    kg_name = self.kg_name
    log = self.response
    use_synonyms = self.use_synonyms
    final_kg = QGOrganizedKnowledgeGraph()
    single_node_qg = eu.make_qg_use_old_types(single_node_qg)  # Temporary patch until we switch to KG2.5.1
    qnode_key = next(iter(single_node_qg.nodes))
    qnode = single_node_qg.nodes[qnode_key]

    # Convert qnode curies as needed (either to synonyms or to canonical versions)
    if qnode.id:
        if use_synonyms and kg_name == "KG1":
            qnode.id = eu.get_curie_synonyms(qnode.id, log)
            qnode.category = []  # Important to clear this, otherwise results are limited (#889)
        elif kg_name == "KG2c":
            qnode.id = eu.get_canonical_curies_list(qnode.id, log)
            qnode.category = []  # Important to clear this to avoid discrepancies in types for particular concepts

    # Build the cypher query: a single curie is matched by equality, anything else with 'in'
    # NOTE(review): the ID value is interpolated directly into the query text - confirm it is
    # validated upstream, or consider query parameters if _run_cypher_query can accept them
    if type(qnode.id) is str:
        where_clause = f"{qnode_key}.id='{qnode.id}'"
    else:
        where_clause = f"{qnode_key}.id in {qnode.id}"
    cypher_query = f"MATCH {self._get_cypher_for_query_node(qnode_key, single_node_qg, kg_name)} WHERE {where_clause} RETURN {qnode_key}"
    log.info(f"Sending cypher query for node {qnode_key} to {kg_name} neo4j")
    rows = self._run_cypher_query(cypher_query, kg_name, log)

    # Load the results into swagger object model and add to our answer knowledge graph
    for row in rows:
        node_key, node = self._convert_neo4j_node_to_swagger_node(row.get(qnode_key), kg_name)
        final_kg.add_node(node_key, node, qnode_key)

    # TODO: remove this patch once we switch to KG2.5.0!
    eu.convert_node_and_edge_types_to_new_format(final_kg)

    return final_kg
def _answer_single_node_query_using_neo4j(self, qnode_key: str, qg: QueryGraph, kg_name: str, log: ARAXResponse):
    """
    Look up a single query node in neo4j and return the matching node(s).

    :param qnode_key: Key of the qnode (within qg) to look up.
    :param qg: The query graph containing that qnode.
    :param kg_name: Name of the KG to query; forwarded to the query runner and node converter.
    :param log: The response object used for logging.
    :return: A knowledge graph (organized by QG keys) holding every node returned for the qnode.
    """
    answer_kg = QGOrganizedKnowledgeGraph()
    qnode = qg.nodes[qnode_key]

    # Build the cypher query: a single curie is matched by equality, anything else with 'in'
    # NOTE(review): the ID value is interpolated directly into the query text - confirm it is
    # validated upstream, or consider query parameters if _run_cypher_query can accept them
    if type(qnode.id) is str:
        where_clause = f"{qnode_key}.id='{qnode.id}'"
    else:
        where_clause = f"{qnode_key}.id in {qnode.id}"
    cypher_query = f"MATCH {self._get_cypher_for_query_node(qnode_key, qg)} WHERE {where_clause} RETURN {qnode_key}"
    log.info(f"Sending cypher query for node {qnode_key} to {kg_name} neo4j")
    rows = self._run_cypher_query(cypher_query, kg_name, log)

    # Load the results into API object model and add to our answer knowledge graph
    for row in rows:
        trapi_node_key, trapi_node = self._convert_neo4j_node_to_trapi_node(row.get(qnode_key), kg_name)
        answer_kg.add_node(trapi_node_key, trapi_node, qnode_key)

    return answer_kg
def _load_plover_answer_into_object_model(self, plover_answer: Dict[str, Dict[str, Union[set, dict]]],
                                          log: ARAXResponse) -> QGOrganizedKnowledgeGraph:
    """
    Convert a Plover answer into a knowledge graph organized by QG keys.

    :param plover_answer: Plover's answer; plover_answer["nodes"] maps each qnode key to a dict
        of node key -> node tuple, and plover_answer["edges"] maps each qedge key to a dict of
        edge key -> edge tuple (both are iterated with .items()).
    :param log: The response object used for logging; the load time per qnode/qedge key is
        reported at debug level.
    :return: The answer knowledge graph.
    """
    answer_kg = QGOrganizedKnowledgeGraph()

    # Load returned nodes into TRAPI object model
    for qnode_key, nodes in plover_answer["nodes"].items():
        num_nodes = len(nodes)
        log.debug(f"Loading {num_nodes} {qnode_key} nodes into TRAPI object model")
        # perf_counter is monotonic, so the reported duration can't be skewed by system clock changes
        start = time.perf_counter()
        for node_key, node_tuple in nodes.items():
            node = self._convert_kg2c_plover_node_to_trapi_node(node_tuple)
            answer_kg.add_node(node_key, node, qnode_key)
        log.debug(f"Loading {num_nodes} {qnode_key} nodes into TRAPI object model took "
                  f"{round(time.perf_counter() - start, 2)} seconds")

    # Load returned edges into TRAPI object model
    for qedge_key, edges in plover_answer["edges"].items():
        num_edges = len(edges)
        # Name the qedge key here too, so this message matches its "took" counterpart below
        log.debug(f"Loading {num_edges} {qedge_key} edges into TRAPI object model")
        start = time.perf_counter()
        for edge_key, edge_tuple in edges.items():
            edge = self._convert_kg2c_plover_edge_to_trapi_edge(edge_tuple)
            answer_kg.add_edge(edge_key, edge, qedge_key)
        log.debug(f"Loading {num_edges} {qedge_key} edges into TRAPI object model took "
                  f"{round(time.perf_counter() - start, 2)} seconds")

    return answer_kg
def answer_one_hop_query(
        self, query_graph: QueryGraph
) -> Tuple[QGOrganizedKnowledgeGraph, Dict[str, Dict[str, str]]]:
    """
    Answer a one-hop (single-edge) query using NGD, with candidate node pairs supplied by KG2.

    KG2 is first queried (through an ARAX DSL action list) for the same one-hop query; NGD is
    then computed for each returned edge's node pair, and an NGD edge plus its two nodes is
    added for every pair whose NGD value is below a fixed threshold.

    :param query_graph: A Reasoner API standard query graph.
    :return: A tuple containing:
        1. an (almost) Reasoner API standard knowledge graph with the NGD nodes and edges
           created for the query (organized by QG IDs).
        2. a map of which nodes fulfilled which qnode_keys for each edge. Example:
           {'KG1:111221': {'n00': 'DOID:111', 'n01': 'HP:124'},
            'KG1:111223': {'n00': 'DOID:111', 'n01': 'HP:126'}}
        Both are returned empty if the query graph is invalid or the KG2 query fails.
    """
    log = self.response
    final_kg = QGOrganizedKnowledgeGraph()
    edge_to_nodes_map = dict()

    # Verify this is a valid one-hop query graph
    self._verify_one_hop_query_graph_is_valid(query_graph, log)
    if log.status != 'OK':
        return final_kg, edge_to_nodes_map

    # Find potential answers using KG2
    log.debug(f"Finding potential answers using KG2")
    qedge_key = next(iter(query_graph.edges))
    qedge = query_graph.edges[qedge_key]
    source_qnode_key, target_qnode_key = qedge.subject, qedge.object
    source_qnode = query_graph.nodes[source_qnode_key]
    target_qnode = query_graph.nodes[target_qnode_key]

    # Collect the DSL parameters for each action; empty/None pieces are dropped when joining
    qedge_params = [
        f"key={qedge_key}",
        f"subject={source_qnode_key}",
        f"object={target_qnode_key}",
        self._get_dsl_qedge_type_str(qedge),
    ]
    source_params = [
        f"key={source_qnode_key}",
        self._get_dsl_qnode_curie_str(source_qnode),
        self._get_dsl_qnode_category_str(source_qnode),
    ]
    target_params = [
        f"key={target_qnode_key}",
        self._get_dsl_qnode_curie_str(target_qnode),
        self._get_dsl_qnode_category_str(target_qnode),
    ]
    qedge_params_str = ", ".join(param for param in qedge_params if param)
    source_params_str = ", ".join(param for param in source_params if param)
    target_params_str = ", ".join(param for param in target_params if param)
    actions_list = [
        f"add_qnode({source_params_str})",
        f"add_qnode({target_params_str})",
        f"add_qedge({qedge_params_str})",
        f"expand(kp=ARAX/KG2)",
        f"return(message=true, store=false)",
    ]
    _, kg2_message = self._run_arax_query(actions_list, log)
    if log.status != 'OK':
        return final_kg, edge_to_nodes_map

    # Go through those answers from KG2 and calculate ngd for each edge
    log.debug(f"Calculating NGD between each potential node pair")
    kg2_answer_kg = kg2_message.knowledge_graph
    cngd = ComputeNGD(log, kg2_message, None)
    cngd.load_curie_to_pmids_data(kg2_answer_kg.nodes)
    kg2_edge_ngd_map = dict()
    for kg2_edge_key, kg2_edge in kg2_answer_kg.edges.items():
        first_key, second_key = kg2_edge.subject, kg2_edge.object
        first_node = kg2_answer_kg.nodes.get(first_key)  # These are already canonicalized (default behavior)
        second_node = kg2_answer_kg.nodes.get(second_key)
        # Figure out which node corresponds to source qnode (don't necessarily match b/c query was bidirectional)
        matches_qedge_direction = (source_qnode_key in first_node.qnode_keys
                                   and target_qnode_key in second_node.qnode_keys)
        if matches_qedge_direction:
            ngd_subject, ngd_object = first_key, second_key
        else:
            ngd_subject, ngd_object = second_key, first_key
        kg2_edge_ngd_map[kg2_edge_key] = {
            "ngd_value": cngd.calculate_ngd_fast(ngd_subject, ngd_object),
            "subject": ngd_subject,
            "object": ngd_object,
        }

    # Create edges for those from KG2 found to have a low enough ngd value
    threshold = 0.5
    log.debug(f"Creating edges between node pairs with NGD below the threshold ({threshold})")
    for kg2_edge_key, ngd_info in kg2_edge_ngd_map.items():
        ngd_value = ngd_info['ngd_value']
        # TODO: Make determination of the threshold much more sophisticated
        if ngd_value is None or not ngd_value < threshold:
            continue
        ngd_edge_key, ngd_edge = self._create_ngd_edge(ngd_value, ngd_info["subject"], ngd_info["object"])
        ngd_source_node_key, ngd_source_node = self._create_ngd_node(
            ngd_edge.subject, kg2_answer_kg.nodes.get(ngd_edge.subject))
        ngd_target_node_key, ngd_target_node = self._create_ngd_node(
            ngd_edge.object, kg2_answer_kg.nodes.get(ngd_edge.object))
        final_kg.add_edge(ngd_edge_key, ngd_edge, qedge_key)
        final_kg.add_node(ngd_source_node_key, ngd_source_node, source_qnode_key)
        final_kg.add_node(ngd_target_node_key, ngd_target_node, target_qnode_key)
        edge_to_nodes_map[ngd_edge_key] = {
            source_qnode_key: ngd_source_node_key,
            target_qnode_key: ngd_target_node_key,
        }

    return final_kg, edge_to_nodes_map
def _answer_query_using_CHP_client(
        self, query_graph: QueryGraph,
        log: ARAXResponse) -> QGOrganizedKnowledgeGraph:
    """
    Answer a single-edge query by sending gene/drug pair queries to the CHP client.

    Only the first qedge of the query graph is processed. Each end of that edge is handled
    in one of two ways:
      * it has curie(s): they are checked with self._check_id() and only the passing ones are kept;
      * it has no curie: the first entry of its category (lower-cased, with 'biolink:' and '_'
        removed) must be one of 'drug', 'chemicalsubstance' or 'gene'.
    One end must turn out to be a drug and the other a gene; otherwise a "CategoryError" is logged.
    Depending on which ends have curies, the CHP client is queried for each source/target curie
    pair (self.client.query) or for a batch of candidate genes/drugs taken from
    self.allowable_gene_curies / self.allowable_drug_curies (self.client.query_all). Every query
    uses disease 'MONDO:0007254' and outcome ('EFO:0000714', '>=', self.CHP_survival_threshold).
    For each result a "paired_with" edge carrying the returned probability is added (the
    non-drug curie is passed first and the drug curie second to _convert_to_swagger_edge),
    followed by the nodes at both ends.

    :param query_graph: The query graph; only its first qedge and that qedge's two qnodes are read.
    :param log: The response object used for logging errors/warnings.
    :return: The answer knowledge graph (organized by QG keys). It is returned empty whenever an
        error is logged, or when self._check_id() reports an error.
    """
    qedge_key = next(qedge_key for qedge_key in query_graph.edges)
    log.debug(
        f"Processing query results for edge {qedge_key} by using CHP client"
    )
    final_kg = QGOrganizedKnowledgeGraph()
    # Normalized category labels accepted for a qnode that has no curie
    gene_label_list = ['gene']
    drug_label_list = ['drug', 'chemicalsubstance']
    # use for checking the requirement
    source_pass_nodes = None
    source_category = None
    target_pass_nodes = None
    target_category = None

    qedge = query_graph.edges[qedge_key]
    source_qnode_key = qedge.subject
    target_qnode_key = qedge.object
    source_qnode = query_graph.nodes[source_qnode_key]
    target_qnode = query_graph.nodes[target_qnode_key]

    # check if both ends of edge have no curie
    if (source_qnode.id is None) and (target_qnode.id is None):
        log.error(f"Both ends of edge {qedge_key} are None",
                  error_code="BadEdge")
        return final_kg

    # Validate the source qnode: check its curie(s) with _check_id, or (if it has none) its category
    if source_qnode.id is not None:

        # NOTE(review): this initial value is always replaced or discarded below (every later
        # branch either reassigns source_pass_nodes or returns) - looks redundant, confirm
        if type(source_qnode.id) is str:
            source_pass_nodes = [source_qnode.id]
        else:
            source_pass_nodes = source_qnode.id
        has_error, pass_nodes, not_pass_nodes = self._check_id(
            source_qnode.id, log)
        if has_error:
            return final_kg
        else:
            if len(not_pass_nodes) == 0 and len(pass_nodes) != 0:
                # Every curie passed
                source_pass_nodes = pass_nodes
            elif len(not_pass_nodes) != 0 and len(pass_nodes) != 0:
                # Some curies passed: continue with those and warn about the rest
                source_pass_nodes = pass_nodes
                if len(not_pass_nodes) == 1:
                    log.warning(
                        f"The curie id of {not_pass_nodes[0]} is not allowable based on CHP client"
                    )
                else:
                    log.warning(
                        f"The curie ids of these nodes {not_pass_nodes} are not allowable based on CHP client"
                    )
            else:
                # No curie passed
                if type(source_qnode.id) is str:
                    log.error(
                        f"The curie id of {source_qnode.id} is not allowable based on CHP client",
                        error_code="NotAllowable")
                    return final_kg
                else:
                    log.error(
                        f"The curie ids of {source_qnode.id} are not allowable based on CHP client",
                        error_code="NotAllowable")
                    return final_kg
    else:
        # No curie on the source qnode: normalize its first category and require drug or gene
        category = source_qnode.category[0].replace(
            'biolink:', '').replace('_', '').lower()
        source_category = category
        if (category in drug_label_list) or (category in gene_label_list):
            source_category = category
        else:
            log.error(
                f"The category of query node {source_qnode_key} is unsatisfiable. It has to be drug/chemical_substance or gene",
                error_code="CategoryError")
            return final_kg

    # Validate the target qnode the same way
    if target_qnode.id is not None:

        # NOTE(review): as for the source side, this initial value looks redundant - confirm
        if type(target_qnode.id) is str:
            target_pass_nodes = [target_qnode.id]
        else:
            target_pass_nodes = target_qnode.id
        has_error, pass_nodes, not_pass_nodes = self._check_id(
            target_qnode.id, log)
        if has_error:
            return final_kg
        else:
            if len(not_pass_nodes) == 0 and len(pass_nodes) != 0:
                # Every curie passed
                target_pass_nodes = pass_nodes
            elif len(not_pass_nodes) != 0 and len(pass_nodes) != 0:
                # Some curies passed: continue with those and warn about the rest
                target_pass_nodes = pass_nodes
                if len(not_pass_nodes) == 1:
                    log.warning(
                        f"The curie id of {not_pass_nodes[0]} is not allowable based on CHP client"
                    )
                else:
                    log.warning(
                        f"The curie ids of these nodes {not_pass_nodes} are not allowable based on CHP client"
                    )
            else:
                # No curie passed
                # NOTE(review): the source side reports this same condition with
                # error_code="NotAllowable"; "CategoryError" here may be a copy/paste slip - confirm
                if type(target_qnode.id) is str:
                    log.error(
                        f"The curie id of {target_qnode.id} is not allowable based on CHP client",
                        error_code="CategoryError")
                    return final_kg
                else:
                    log.error(
                        f"The curie ids of {target_qnode.id} are not allowable based on CHP client",
                        error_code="CategoryError")
                    return final_kg
    else:
        # No curie on the target qnode: normalize its first category and require drug or gene
        category = target_qnode.category[0].replace(
            'biolink:', '').replace('_', '').lower()
        target_category = category
        if (category in drug_label_list) or (category in gene_label_list):
            target_category = category
        else:
            log.error(
                f"The category of query node {target_qnode_key} is unsatisfiable. It has to be drug/chemical_substance or gene",
                error_code="CategoryError")
            return final_kg

    if (source_pass_nodes is None) and (target_pass_nodes is None):
        # Neither end has usable curies
        return final_kg

    elif (source_pass_nodes is not None) and (target_pass_nodes is not None):
        # Case 1: both ends have curies, so query each (source curie, target curie) pair
        source_dict = dict()
        target_dict = dict()
        # The type of each end is decided by its FIRST passing curie only
        if source_pass_nodes[0] in self.allowable_drug_curies:
            source_category_temp = 'drug'
        else:
            source_category_temp = 'gene'
        if target_pass_nodes[0] in self.allowable_drug_curies:
            target_category_temp = 'drug'
        else:
            target_category_temp = 'gene'
        if source_category_temp == target_category_temp:
            log.error(
                f"The query nodes in both ends of edge are the same type which is {source_category_temp}",
                error_code="CategoryError")
            return final_kg
        else:
            for (source_curie, target_curie) in itertools.product(
                    source_pass_nodes, target_pass_nodes):

                if source_category_temp == 'drug':
                    # The query is sent with the 'CHEMBL:' prefix in place of 'CHEMBL.COMPOUND:'
                    source_curie_temp = source_curie.replace(
                        'CHEMBL.COMPOUND:', 'CHEMBL:')
                    # Let's build a simple single query
                    q = build_query(genes=[target_curie],
                                    therapeutic=source_curie_temp,
                                    disease='MONDO:0007254',
                                    outcome=('EFO:0000714', '>=',
                                             self.CHP_survival_threshold))

                    response = self.client.query(q)
                    max_probability = self.client.get_outcome_prob(
                        response)
                    swagger_edge_key, swagger_edge = self._convert_to_swagger_edge(
                        target_curie, source_curie, "paired_with",
                        max_probability)
                else:
                    # The query is sent with the 'CHEMBL:' prefix in place of 'CHEMBL.COMPOUND:'
                    target_curie_temp = target_curie.replace(
                        'CHEMBL.COMPOUND:', 'CHEMBL:')
                    # Let's build a simple single query
                    q = build_query(genes=[source_curie],
                                    therapeutic=target_curie_temp,
                                    disease='MONDO:0007254',
                                    outcome=('EFO:0000714', '>=',
                                             self.CHP_survival_threshold))

                    response = self.client.query(q)
                    max_probability = self.client.get_outcome_prob(
                        response)
                    swagger_edge_key, swagger_edge = self._convert_to_swagger_edge(
                        source_curie, target_curie, "paired_with",
                        max_probability)

                # Remember which qnode key each curie fulfilled, for adding the nodes afterwards
                source_dict[source_curie] = source_qnode_key
                target_dict[target_curie] = target_qnode_key

                # Finally add the current edge to our answer knowledge graph
                final_kg.add_edge(swagger_edge_key, swagger_edge, qedge_key)

            # Add the nodes to our answer knowledge graph
            if len(source_dict) != 0:
                for source_curie in source_dict:
                    swagger_node_key, swagger_node = self._convert_to_swagger_node(
                        source_curie)
                    final_kg.add_node(swagger_node_key, swagger_node,
                                      source_dict[source_curie])
            if len(target_dict) != 0:
                for target_curie in target_dict:
                    swagger_node_key, swagger_node = self._convert_to_swagger_node(
                        target_curie)
                    final_kg.add_node(swagger_node_key, swagger_node,
                                      target_dict[target_curie])

            return final_kg

    elif source_pass_nodes is not None:
        # Case 2: only the source end has curies; the target end is described by its category
        source_dict = dict()
        target_dict = dict()
        if source_pass_nodes[0] in self.allowable_drug_curies:
            source_category_temp = 'drug'
        else:
            source_category_temp = 'gene'
        if target_category in drug_label_list:
            target_category_temp = 'drug'
        else:
            target_category_temp = 'gene'
        if source_category_temp == target_category_temp:
            log.error(
                f"The query nodes in both ends of edge are the same type which is {source_category_temp}",
                error_code="CategoryError")
            return final_kg
        else:
            if source_category_temp == 'drug':
                for source_curie in source_pass_nodes:
                    # Candidate genes: allowable gene curies known to the synonymizer whose
                    # normalized categories include the target category
                    # NOTE(review): this list does not depend on source_curie, yet it is rebuilt
                    # (with two synonymizer calls per curie) on every iteration
                    genes = [
                        curie for curie in self.allowable_gene_curies
                        if self.synonymizer.get_canonical_curies(curie)
                        [curie] is not None and target_category in [
                            category.replace('biolink:', '').replace(
                                '_', '').lower() for category in list(
                                    self.synonymizer.get_canonical_curies(
                                        curie, return_all_categories=True)
                                    [curie]['all_categories'].keys())
                        ]
                    ]
                    therapeutic = source_curie.replace(
                        'CHEMBL.COMPOUND:', 'CHEMBL:')
                    disease = 'MONDO:0007254'
                    outcome = ('EFO:0000714', '>=',
                               self.CHP_survival_threshold)

                    queries = []
                    for gene in genes:
                        queries.append(
                            build_query(
                                genes=[gene],
                                therapeutic=therapeutic,
                                disease=disease,
                                outcome=outcome,
                            ))

                    # use the query_all endpoint to run the batch of queries
                    res = self.client.query_all(queries)

                    # Results are paired with genes by position
                    for result, gene in zip(res["message"], genes):
                        prob = self.client.get_outcome_prob(result)
                        swagger_edge_key, swagger_edge = self._convert_to_swagger_edge(
                            gene, source_curie, "paired_with", prob)

                        source_dict[source_curie] = source_qnode_key
                        target_dict[gene] = target_qnode_key

                        # Finally add the current edge to our answer knowledge graph
                        final_kg.add_edge(swagger_edge_key, swagger_edge,
                                          qedge_key)
            else:
                for source_curie in source_pass_nodes:
                    genes = [source_curie]
                    # Candidate drugs: allowable drug curies (looked up in the synonymizer under
                    # the 'CHEMBL.COMPOUND:' prefix) whose normalized categories include the target
                    # category; they are kept with the 'CHEMBL:' prefix for the query
                    therapeutic = [
                        curie.replace('CHEMBL.COMPOUND:', 'CHEMBL:')
                        for curie in self.allowable_drug_curies
                        if self.synonymizer.get_canonical_curies(
                            curie.replace('CHEMBL:', 'CHEMBL.COMPOUND:'))
                        [curie.replace('CHEMBL:', 'CHEMBL.COMPOUND:')]
                        is not None and target_category in [
                            category.replace('biolink:', '').replace(
                                '_', '').lower() for category in list(
                                    self.synonymizer.get_canonical_curies(
                                        curie.replace('CHEMBL:',
                                                      'CHEMBL.COMPOUND:'),
                                        return_all_categories=True)[
                                            curie.replace(
                                                'CHEMBL:',
                                                'CHEMBL.COMPOUND:')]
                                    ['all_categories'].keys())
                        ]
                    ]
                    disease = 'MONDO:0007254'
                    outcome = ('EFO:0000714', '>=',
                               self.CHP_survival_threshold)

                    queries = []
                    for drug in therapeutic:
                        queries.append(
                            build_query(
                                genes=genes,
                                therapeutic=drug,
                                disease=disease,
                                outcome=outcome,
                            ))

                    # use the query_all endpoint to run the batch of queries
                    res = self.client.query_all(queries)

                    # Results are paired with drugs by position
                    for result, drug in zip(res["message"], therapeutic):
                        # Switch back to the 'CHEMBL.COMPOUND:' prefix for the answer KG
                        drug = drug.replace('CHEMBL:', 'CHEMBL.COMPOUND:')
                        prob = self.client.get_outcome_prob(result)
                        swagger_edge_key, swagger_edge = self._convert_to_swagger_edge(
                            source_curie, drug, "paired_with", prob)

                        source_dict[source_curie] = source_qnode_key
                        target_dict[drug] = target_qnode_key

                        # Finally add the current edge to our answer knowledge graph
                        final_kg.add_edge(swagger_edge_key, swagger_edge,
                                          qedge_key)

            # Add the nodes to our answer knowledge graph
            if len(source_dict) != 0:
                for source_curie in source_dict:
                    swagger_node_key, swagger_node = self._convert_to_swagger_node(
                        source_curie)
                    final_kg.add_node(swagger_node_key, swagger_node,
                                      source_dict[source_curie])
            if len(target_dict) != 0:
                for target_curie in target_dict:
                    swagger_node_key, swagger_node = self._convert_to_swagger_node(
                        target_curie)
                    final_kg.add_node(swagger_node_key, swagger_node,
                                      target_dict[target_curie])

            return final_kg
    else:
        # Case 3: only the target end has curies; the source end is described by its category
        source_dict = dict()
        target_dict = dict()
        if target_pass_nodes[0] in self.allowable_drug_curies:
            target_category_temp = 'drug'
        else:
            target_category_temp = 'gene'
        if source_category in drug_label_list:
            source_category_temp = 'drug'
        else:
            source_category_temp = 'gene'
        if source_category_temp == target_category_temp:
            log.error(
                f"The query nodes in both ends of edge are the same type which is {source_category_temp}",
                error_code="CategoryError")
            return final_kg
        else:
            if target_category_temp == 'drug':
                for target_curie in target_pass_nodes:
                    # Candidate genes: allowable gene curies known to the synonymizer whose
                    # normalized categories include the source category
                    genes = [
                        curie for curie in self.allowable_gene_curies
                        if self.synonymizer.get_canonical_curies(curie)
                        [curie] is not None and source_category in [
                            category.replace('biolink:', '').replace(
                                '_', '').lower() for category in list(
                                    self.synonymizer.get_canonical_curies(
                                        curie, return_all_categories=True)
                                    [curie]['all_categories'].keys())
                        ]
                    ]
                    therapeutic = target_curie.replace(
                        'CHEMBL.COMPOUND:', 'CHEMBL:')
                    disease = 'MONDO:0007254'
                    outcome = ('EFO:0000714', '>=',
                               self.CHP_survival_threshold)

                    queries = []
                    for gene in genes:
                        queries.append(
                            build_query(
                                genes=[gene],
                                therapeutic=therapeutic,
                                disease=disease,
                                outcome=outcome,
                            ))

                    # use the query_all endpoint to run the batch of queries
                    res = self.client.query_all(queries)

                    # Results are paired with genes by position
                    for result, gene in zip(res["message"], genes):
                        prob = self.client.get_outcome_prob(result)
                        swagger_edge_key, swagger_edge = self._convert_to_swagger_edge(
                            gene, target_curie, "paired_with", prob)

                        source_dict[gene] = source_qnode_key
                        target_dict[target_curie] = target_qnode_key

                        # Finally add the current edge to our answer knowledge graph
                        final_kg.add_edge(swagger_edge_key, swagger_edge,
                                          qedge_key)

            else:
                for target_curie in target_pass_nodes:
                    genes = [target_curie]
                    # Candidate drugs: allowable drug curies (looked up in the synonymizer under
                    # the 'CHEMBL.COMPOUND:' prefix) whose normalized categories include the source
                    # category; they are kept with the 'CHEMBL:' prefix for the query
                    therapeutic = [
                        curie.replace('CHEMBL.COMPOUND:', 'CHEMBL:')
                        for curie in self.allowable_drug_curies
                        if self.synonymizer.get_canonical_curies(
                            curie.replace('CHEMBL:', 'CHEMBL.COMPOUND:'))
                        [curie.replace('CHEMBL:', 'CHEMBL.COMPOUND:')]
                        is not None and source_category in [
                            category.replace('biolink:', '').replace(
                                '_', '').lower() for category in list(
                                    self.synonymizer.get_canonical_curies(
                                        curie.replace('CHEMBL:',
                                                      'CHEMBL.COMPOUND:'),
                                        return_all_categories=True)[
                                            curie.replace(
                                                'CHEMBL:',
                                                'CHEMBL.COMPOUND:')]
                                    ['all_categories'].keys())
                        ]
                    ]
                    disease = 'MONDO:0007254'
                    outcome = ('EFO:0000714', '>=',
                               self.CHP_survival_threshold)

                    queries = []
                    for drug in therapeutic:
                        queries.append(
                            build_query(
                                genes=genes,
                                therapeutic=drug,
                                disease=disease,
                                outcome=outcome,
                            ))

                    # use the query_all endpoint to run the batch of queries
                    res = self.client.query_all(queries)

                    # Results are paired with drugs by position
                    for result, drug in zip(res["message"], therapeutic):
                        # Switch back to the 'CHEMBL.COMPOUND:' prefix for the answer KG
                        drug = drug.replace('CHEMBL:', 'CHEMBL.COMPOUND:')
                        prob = self.client.get_outcome_prob(result)
                        swagger_edge_key, swagger_edge = self._convert_to_swagger_edge(
                            target_curie, drug, "paired_with", prob)

                        source_dict[drug] = source_qnode_key
                        target_dict[target_curie] = target_qnode_key

                        # Finally add the current edge to our answer knowledge graph
                        final_kg.add_edge(swagger_edge_key, swagger_edge,
                                          qedge_key)

            # Add the nodes to our answer knowledge graph
            if len(source_dict) != 0:
                for source_curie in source_dict:
                    swagger_node_key, swagger_node = self._convert_to_swagger_node(
                        source_curie)
                    final_kg.add_node(swagger_node_key, swagger_node,
                                      source_dict[source_curie])
            if len(target_dict) != 0:
                for target_curie in target_dict:
                    swagger_node_key, swagger_node = self._convert_to_swagger_node(
                        target_curie)
                    final_kg.add_node(swagger_node_key, swagger_node,
                                      target_dict[target_curie])

            return final_kg
def answer_one_hop_query(self, query_graph: QueryGraph) -> QGOrganizedKnowledgeGraph:
    """
    Answer a one-hop (single-edge) query using NGD, with candidate node pairs supplied by KG2.

    The qedge's predicates (if any) must overlap self.accepted_qedge_predicates. KG2 is queried
    with a copy of the query graph whose qedge predicates have been cleared; NGD is then
    computed for each returned edge's node pair, and an NGD edge (with its supporting PMIDs)
    plus its two nodes is added for every pair whose NGD value is below a fixed threshold.

    :param query_graph: A TRAPI query graph.
    :return: An (almost) TRAPI knowledge graph containing the NGD nodes and edges created for
        the query (organized by QG IDs). It is returned empty if the query graph is invalid,
        its predicates are unsupported, or the KG2 query fails.
    """
    log = self.response
    final_kg = QGOrganizedKnowledgeGraph()

    # Verify this is a valid one-hop query graph
    self._verify_one_hop_query_graph_is_valid(query_graph, log)
    if log.status != 'OK':
        return final_kg
    qedge_key = next(iter(query_graph.edges))
    qedge = query_graph.edges[qedge_key]
    if qedge.predicates:
        supported_predicates = set(qedge.predicates).intersection(self.accepted_qedge_predicates)
        if not supported_predicates:
            log.error(f"NGD can only expand qedges with these predicates: {self.accepted_qedge_predicates}. QEdge"
                      f" {qedge_key}'s predicate is: {qedge.predicates}", error_code="UnsupportedQG")
            return final_kg
    source_qnode_key, target_qnode_key = qedge.subject, qedge.object

    # Find potential answers using KG2 (with predicates cleared on a copy of the query graph)
    log.debug(f"Finding potential answers using KG2")
    modified_qg = copy.deepcopy(query_graph)
    for copied_qedge in modified_qg.edges.values():
        copied_qedge.predicates = None
    request_body = {"message": {"query_graph": modified_qg.to_dict()}}
    _, kg2_message = self._run_arax_query(request_body, log)
    if log.status != 'OK':
        return final_kg

    # Go through those answers from KG2 and calculate ngd for each edge
    log.debug(f"Calculating NGD between each potential node pair")
    kg2_answer_kg = kg2_message.knowledge_graph
    cngd = ComputeNGD(log, kg2_message, None)
    cngd.load_curie_to_pmids_data(kg2_answer_kg.nodes)
    kg2_edge_ngd_map = dict()
    for kg2_edge_key, kg2_edge in kg2_answer_kg.edges.items():
        first_key, second_key = kg2_edge.subject, kg2_edge.object
        first_node = kg2_answer_kg.nodes.get(first_key)  # These are already canonicalized (default behavior)
        second_node = kg2_answer_kg.nodes.get(second_key)
        # Figure out which node corresponds to source qnode (don't necessarily match b/c query was bidirectional)
        matches_qedge_direction = (source_qnode_key in first_node.qnode_keys
                                   and target_qnode_key in second_node.qnode_keys)
        if matches_qedge_direction:
            ngd_subject, ngd_object = first_key, second_key
        else:
            ngd_subject, ngd_object = second_key, first_key
        ngd_value, pmid_set = cngd.calculate_ngd_fast(ngd_subject, ngd_object)
        kg2_edge_ngd_map[kg2_edge_key] = {
            "ngd_value": ngd_value,
            "subject": ngd_subject,
            "object": ngd_object,
            "pmids": [f"PMID:{pmid}" for pmid in pmid_set],
        }

    # Create edges for those from KG2 found to have a low enough ngd value
    threshold = 0.5
    log.debug(f"Creating edges between node pairs with NGD below the threshold ({threshold})")
    for kg2_edge_key, ngd_info in kg2_edge_ngd_map.items():
        ngd_value = ngd_info['ngd_value']
        # TODO: Make determination of the threshold much more sophisticated
        if ngd_value is None or not ngd_value < threshold:
            continue
        ngd_edge_key, ngd_edge = self._create_ngd_edge(ngd_value, ngd_info["subject"], ngd_info["object"],
                                                       ngd_info["pmids"])
        ngd_source_node_key, ngd_source_node = self._create_ngd_node(
            ngd_edge.subject, kg2_answer_kg.nodes.get(ngd_edge.subject))
        ngd_target_node_key, ngd_target_node = self._create_ngd_node(
            ngd_edge.object, kg2_answer_kg.nodes.get(ngd_edge.object))
        final_kg.add_edge(ngd_edge_key, ngd_edge, qedge_key)
        final_kg.add_node(ngd_source_node_key, ngd_source_node, source_qnode_key)
        final_kg.add_node(ngd_target_node_key, ngd_target_node, target_qnode_key)

    return final_kg
def _grab_nodes_and_edges_from_sqlite(self, plover_answer: Dict[str, Dict[str, Set[Union[str, int]]]],
                                      kg_name: str, log: ARAXResponse) -> QGOrganizedKnowledgeGraph:
    """
    Look up the serialized node and edge objects for the given IDs in the local KG2c sqlite database and
    load them into a QG-organized knowledge graph.
    :param plover_answer: Maps "nodes" to {qnode_key: node IDs} and "edges" to {qedge_key: edge IDs}.
    :param kg_name: Handed through to the node/edge conversion helpers.
    :param log: The response object that debug messages are written to.
    :return: A knowledge graph containing every node/edge found in sqlite, filed under its qnode/qedge key.
    """
    # SQLite caps how many "?" parameters a single statement may bind (999 by default in builds older
    # than 3.32.0), so IDs are looked up in batches that stay below that cap.
    max_ids_per_query = 900

    def fetch_serialized_objects(cursor, sql_prefix, ids):
        """Return the first column of each row whose ID is in `ids`; IDs are bound as SQL parameters."""
        id_list = list(ids)
        serialized_objects = []
        for batch_start in range(0, len(id_list), max_ids_per_query):
            id_batch = id_list[batch_start:batch_start + max_ids_per_query]
            placeholders = ",".join("?" * len(id_batch))
            cursor.execute(f"{sql_prefix} ({placeholders})", id_batch)
            serialized_objects.extend(row[0] for row in cursor.fetchall())
        return serialized_objects

    # Locate the local sqlite database (look up its path using database manager-friendly method)
    path_list = os.path.realpath(__file__).split(os.path.sep)
    rtx_index = path_list.index("RTX")
    rtxc = RTXConfiguration()
    sqlite_dir_path = os.path.sep.join([
        *path_list[:(rtx_index + 1)], 'code', 'ARAX', 'KnowledgeSources', 'KG2c'
    ])
    sqlite_name = rtxc.kg2c_sqlite_path.split('/')[-1]
    sqlite_file_path = f"{sqlite_dir_path}{os.path.sep}{sqlite_name}"

    answer_kg = QGOrganizedKnowledgeGraph()
    connection = sqlite3.connect(sqlite_file_path)
    cursor = connection.cursor()
    try:
        # Grab the node objects from sqlite corresponding to the returned node IDs
        num_nodes = sum(len(nodes) for nodes in plover_answer["nodes"].values())
        start = time.time()
        for qnode_key, node_keys in plover_answer["nodes"].items():
            log.debug(
                f"Looking up {len(plover_answer['nodes'][qnode_key])} returned {qnode_key} node IDs in KG2c sqlite"
            )
            serialized_nodes = fetch_serialized_objects(
                cursor, "SELECT N.node FROM nodes AS N WHERE N.id IN", node_keys)
            for serialized_node in serialized_nodes:
                node_as_dict = ujson.loads(serialized_node)
                node_key, node = self._convert_neo4j_node_to_trapi_node(node_as_dict, kg_name)
                answer_kg.add_node(node_key, node, qnode_key)
        log.debug(
            f"Grabbing {num_nodes} nodes from sqlite and loading into object model took "
            f"{round(time.time() - start, 2)} seconds")

        # Grab the edge objects from sqlite corresponding to the returned edge IDs
        num_edges = sum(len(edges) for edges in plover_answer["edges"].values())
        start = time.time()
        for qedge_key, edge_keys in plover_answer["edges"].items():
            log.debug(
                f"Looking up {len(plover_answer['edges'][qedge_key])} returned {qedge_key} edge IDs in KG2c sqlite"
            )
            serialized_edges = fetch_serialized_objects(
                cursor, "SELECT E.edge FROM edges AS E WHERE E.id IN", edge_keys)
            for serialized_edge in serialized_edges:
                edge_as_dict = ujson.loads(serialized_edge)
                # No node UUID -> curie remapping is supplied here, hence the empty dict
                edge_key, edge = self._convert_neo4j_edge_to_trapi_edge(edge_as_dict, dict(), kg_name)
                answer_kg.add_edge(edge_key, edge, qedge_key)
        log.debug(
            f"Grabbing {num_edges} edges from sqlite and loading into object model took "
            f"{round(time.time() - start, 2)} seconds")
    finally:
        # Release the database handle even when a lookup, JSON parse or conversion raises
        cursor.close()
        connection.close()

    return answer_kg
def answer_one_hop_query(
        self, query_graph: QueryGraph
) -> Tuple[QGOrganizedKnowledgeGraph, Dict[str, Dict[str, str]]]:
    """
    Answer a one-hop (single-edge) query using the Genetics Provider.
    :param query_graph: A Reasoner API standard query graph.
    :return: A tuple containing:
        1. an (almost) Reasoner API standard knowledge graph containing all of the nodes and edges returned as
       results for the query. (Dictionary version, organized by QG IDs.)
        2. a map of which nodes fulfilled which qnode_keys for each edge. Example:
          {'KG1:111221': {'n00': 'DOID:111', 'n01': 'HP:124'}, 'KG1:111223': {'n00': 'DOID:111', 'n01': 'HP:126'}}
    """
    log = self.response
    include_all_scores = self.response.data['parameters']['include_all_scores']
    final_kg = QGOrganizedKnowledgeGraph()
    edge_to_nodes_map = {}
    # Temporary patch until TRAPI 1.0 KP endpoint is ready
    query_graph = eu.make_qg_use_old_types(query_graph)

    # Stop early unless this is a valid one-hop query graph
    self._verify_one_hop_query_graph_is_valid(query_graph, log)
    if log.status != 'OK':
        return final_kg, edge_to_nodes_map
    # Tweak the query graph's contents as needed for this KP
    modified_query_graph = self._pre_process_query_graph(query_graph, log)
    if log.status != 'OK':
        return final_kg, edge_to_nodes_map
    qedge = next(iter(modified_query_graph.edges.values()))
    source_qnode_key, target_qnode_key = qedge.subject, qedge.object

    # Send the query to the KP; with no knowledge graph in the response there is nothing to load
    json_response = self._send_query_to_kp(modified_query_graph, log)
    returned_kg = json_response.get('knowledge_graph')
    if not returned_kg:
        log.warning(
            f"No KG is present in the response from {self.kp_name}")
        return final_kg, edge_to_nodes_map

    # Map of returned node/edge IDs to the qnode/qedge IDs they fulfill
    qg_id_mappings = self._get_qg_id_mappings_from_results(json_response['results'])
    unknown_scores_encountered = set()

    # Load the returned edges into our Swagger model
    for returned_edge in returned_kg['edges']:
        # Skip edges missing a source and/or target ID (have encountered these before)
        if not (returned_edge['source_id'] and returned_edge['target_id']):
            log.warning(
                f"Edge returned from GeneticsKP is lacking a subject and/or object: {returned_edge}."
                f" Will skip adding this edge to the KG.")
            continue
        score_name = returned_edge['score_name']
        if score_name not in self.score_type_lookup:
            unknown_scores_encountered.add(score_name)
        # NOTE(review): the comment previously on this check read "Always include edges for integrated
        # scores, but only include magma edges if that flag is set", yet the condition always keeps edges
        # whose score_name equals self.magma_score_name and keeps all others only when include_all_scores
        # is truthy. Behavior is left as-is; confirm which of the two is intended.
        if not (include_all_scores or score_name == self.magma_score_name):
            continue
        kp_edge_key, swagger_edge = self._create_swagger_edge_from_kp_edge(returned_edge)
        # Convert to an ID that's unique for us
        swagger_edge_key = self._create_unique_edge_key(swagger_edge)
        for qedge_key in qg_id_mappings['edges'][kp_edge_key]:
            final_kg.add_edge(swagger_edge_key, swagger_edge, qedge_key)
        # NOTE(review): assumed to be recorded once per kept edge, after the qedge loop - confirm
        # against the original layout.
        edge_to_nodes_map[swagger_edge_key] = {
            source_qnode_key: swagger_edge.subject,
            target_qnode_key: swagger_edge.object
        }
    if unknown_scores_encountered:
        log.warning(
            f"Encountered unknown score(s) from {self.kp_name}: {unknown_scores_encountered}. "
            f"Not sure what data type to assign these.")

    # Load the returned nodes into our Swagger model
    for returned_node in returned_kg['nodes']:
        # Skip any nodes with 'None' for their ID (see discussion in #1154)
        if not returned_node['id']:
            log.warning(
                f"Node returned from {self.kp_name} is lacking an ID: {returned_node}."
                f" Will skip adding this node to the KG.")
            continue
        swagger_node_key, swagger_node = self._create_swagger_node_from_kp_node(returned_node)
        for qnode_key in qg_id_mappings['nodes'][swagger_node_key]:
            final_kg.add_node(swagger_node_key, swagger_node, qnode_key)

    return final_kg, edge_to_nodes_map