def _add_answers_to_kg(self, answer_kg: QGOrganizedKnowledgeGraph, reasoner_std_response: Dict[str, any], input_qnode_key: str, output_qnode_key: str, qedge_key: str, log: ARAXResponse) -> QGOrganizedKnowledgeGraph: kg_to_qg_ids_dict = self._build_kg_to_qg_id_dict(reasoner_std_response['results']) if reasoner_std_response['knowledge_graph']['edges']: remapped_node_keys = dict() log.debug(f"Got results back from BTE for this query " f"({len(reasoner_std_response['knowledge_graph']['edges'])} edges)") for node in reasoner_std_response['knowledge_graph']['nodes']: swagger_node = Node() bte_node_key = node.get('id') swagger_node.name = node.get('name') swagger_node.category = eu.convert_to_list(eu.convert_string_to_snake_case(node.get('type'))) # Map the returned BTE qg_ids back to the original qnode_keys in our query graph bte_qg_id = kg_to_qg_ids_dict['nodes'].get(bte_node_key) if bte_qg_id == "n0": qnode_key = input_qnode_key elif bte_qg_id == "n1": qnode_key = output_qnode_key else: log.error("Could not map BTE qg_id to ARAX qnode_key", error_code="UnknownQGID") return answer_kg # Find and use the preferred equivalent identifier for this node (if it's an output node) if qnode_key == output_qnode_key: if bte_node_key in remapped_node_keys: swagger_node_key = remapped_node_keys.get(bte_node_key) else: equivalent_curies = [f"{prefix}:{eu.get_curie_local_id(local_id)}" for prefix, local_ids in node.get('equivalent_identifiers').items() for local_id in local_ids] swagger_node_key = self._get_best_equivalent_bte_curie(equivalent_curies, swagger_node.category[0]) remapped_node_keys[bte_node_key] = swagger_node_key else: swagger_node_key = bte_node_key answer_kg.add_node(swagger_node_key, swagger_node, qnode_key) for edge in reasoner_std_response['knowledge_graph']['edges']: swagger_edge = Edge() swagger_edge_key = edge.get("id") swagger_edge.predicate = edge.get('type') swagger_edge.subject = remapped_node_keys.get(edge.get('source_id'), edge.get('source_id')) 
swagger_edge.object = remapped_node_keys.get(edge.get('target_id'), edge.get('target_id')) swagger_edge.attributes = [Attribute(name="provided_by", value=edge.get('edge_source'), type=eu.get_attribute_type("provided_by")), Attribute(name="is_defined_by", value="BTE", type=eu.get_attribute_type("is_defined_by"))] # Map the returned BTE qg_id back to the original qedge_key in our query graph bte_qg_id = kg_to_qg_ids_dict['edges'].get(swagger_edge_key) if bte_qg_id != "e1": log.error("Could not map BTE qg_id to ARAX qedge_key", error_code="UnknownQGID") return answer_kg answer_kg.add_edge(swagger_edge_key, swagger_edge, qedge_key) return answer_kg
def answer_single_node_query(self, single_node_qg: QueryGraph) -> QGOrganizedKnowledgeGraph:
    """
    Answer a single-node query by sending a cypher query to the configured KG's neo4j.

    :param single_node_qg: A query graph; only its first qnode is looked at.
    :return: A knowledge graph (organized by QG IDs) holding the nodes that neo4j returned.
    """
    log = self.response
    kg_name = self.kg_name
    use_synonyms = self.use_synonyms
    final_kg = QGOrganizedKnowledgeGraph()
    single_node_qg = eu.make_qg_use_old_types(single_node_qg)  # Temporary patch until we switch to KG2.5.1
    qnode_key = next(iter(single_node_qg.nodes))
    qnode = single_node_qg.nodes[qnode_key]

    # Swap the qnode's curie(s) for synonyms (KG1) or canonical curies (KG2c) where applicable
    if qnode.id:
        if use_synonyms and kg_name == "KG1":
            qnode.id = eu.get_curie_synonyms(qnode.id, log)
            qnode.category = []  # Important to clear this, otherwise results are limited (#889)
        elif kg_name == "KG2c":
            qnode.id = eu.get_canonical_curies_list(qnode.id, log)
            qnode.category = []  # Important to clear this to avoid discrepancies in types for particular concepts

    # Assemble the cypher query (a single string curie gets '=', anything else gets 'in')
    if type(qnode.id) is str:
        where_clause = f"{qnode_key}.id='{qnode.id}'"
    else:
        where_clause = f"{qnode_key}.id in {qnode.id}"
    match_clause = self._get_cypher_for_query_node(qnode_key, single_node_qg, kg_name)
    cypher_query = f"MATCH {match_clause} WHERE {where_clause} RETURN {qnode_key}"
    log.info(f"Sending cypher query for node {qnode_key} to {kg_name} neo4j")
    neo4j_rows = self._run_cypher_query(cypher_query, kg_name, log)

    # Convert each returned node and file it under the qnode it fulfills
    for row in neo4j_rows:
        swagger_node_key, swagger_node = self._convert_neo4j_node_to_swagger_node(row.get(qnode_key), kg_name)
        final_kg.add_node(swagger_node_key, swagger_node, qnode_key)

    # TODO: remove this patch once we switch to KG2.5.0!
    eu.convert_node_and_edge_types_to_new_format(final_kg)

    return final_kg
def _load_answers_into_kg(self, neo4j_results: List[Dict[str, List[Dict[str, any]]]], kg_name: str,
                          qg: QueryGraph, log: ARAXResponse) -> QGOrganizedKnowledgeGraph:
    """
    Convert the first neo4j results table into a QG-organized knowledge graph.

    Columns whose names start with 'nodes' or 'edges' are loaded; the remainder of the column name
    (after the 'nodes_'/'edges_' prefix) is used as the qnode/qedge key. Other columns are ignored.

    :param neo4j_results: Results from neo4j; only the first item (a column name -> rows mapping) is read.
    :param kg_name: Name of the KG queried; a node uuid-to-curie lookup is built only for "KG1".
    :param qg: The query graph that was answered (used here only for the debug message).
    :param log: Response object used for logging.
    :return: The populated knowledge graph.
    """
    first_qedge_key = next(iter(qg.edges))
    log.debug(f"Processing query results for edge {first_qedge_key}")
    final_kg = QGOrganizedKnowledgeGraph()
    results_table = neo4j_results[0]
    if kg_name == "KG1":
        node_uuid_to_curie_dict = self._build_node_uuid_to_curie_dict(results_table)
    else:
        node_uuid_to_curie_dict = {}

    for column_name in list(results_table):
        if column_name.startswith('nodes'):  # Example column name: 'nodes_n00'
            fulfilled_qnode_key = column_name.replace("nodes_", "", 1)
            for neo4j_node in results_table.get(column_name):
                node_key, node = self._convert_neo4j_node_to_trapi_node(neo4j_node, kg_name)
                final_kg.add_node(node_key, node, fulfilled_qnode_key)
        elif column_name.startswith('edges'):  # Example column name: 'edges_e01'
            fulfilled_qedge_key = column_name.replace("edges_", "", 1)
            for neo4j_edge in results_table.get(column_name):
                edge_key, edge = self._convert_neo4j_edge_to_trapi_edge(neo4j_edge,
                                                                        node_uuid_to_curie_dict,
                                                                        kg_name)
                final_kg.add_edge(edge_key, edge, fulfilled_qedge_key)

    return final_kg
def _answer_single_node_query_using_neo4j(self, qnode_key: str, qg: QueryGraph, kg_name: str, log: ARAXResponse):
    """
    Look up the node(s) for a single qnode in neo4j and return them as a QG-organized knowledge graph.

    :param qnode_key: Key of the qnode to answer; also used as the cypher variable name.
    :param qg: The query graph containing that qnode.
    :param kg_name: Name of the KG whose neo4j instance is queried.
    :param log: Response object used for logging.
    :return: A knowledge graph containing the returned nodes, filed under qnode_key.
    """
    qnode = qg.nodes[qnode_key]
    answer_kg = QGOrganizedKnowledgeGraph()

    # A single string curie is matched with '='; anything else is matched with 'in'
    if type(qnode.id) is str:
        where_clause = f"{qnode_key}.id='{qnode.id}'"
    else:
        where_clause = f"{qnode_key}.id in {qnode.id}"
    match_clause = self._get_cypher_for_query_node(qnode_key, qg)
    cypher_query = f"MATCH {match_clause} WHERE {where_clause} RETURN {qnode_key}"
    log.info(f"Sending cypher query for node {qnode_key} to {kg_name} neo4j")
    neo4j_rows = self._run_cypher_query(cypher_query, kg_name, log)

    # Convert each returned node into the API object model and add it to the answer
    for row in neo4j_rows:
        node_key, node = self._convert_neo4j_node_to_trapi_node(row.get(qnode_key), kg_name)
        answer_kg.add_node(node_key, node, qnode_key)

    return answer_kg
def answer_one_hop_query(self, query_graph: QueryGraph) -> Tuple[QGOrganizedKnowledgeGraph, Dict[str, Dict[str, str]]]:
    """
    Answer a one-hop (single-edge) query using BTE.

    :param query_graph: A Reasoner API standard query graph.
    :return: A tuple of:
        1. an (almost) Reasoner API standard knowledge graph holding all nodes and edges returned as
           results for the query (dictionary version, organized by QG IDs), and
        2. a map of which nodes fulfilled which qnode_keys for each edge. Example:
           {'KG1:111221': {'n00': 'DOID:111', 'n01': 'HP:124'}, 'KG1:111223': {'n00': 'DOID:111', 'n01': 'HP:126'}}
        Both are returned as they stand at that point if the response status stops being 'OK' along the way.
    """
    parameters = self.response.data['parameters']
    enforce_directionality = parameters.get('enforce_directionality')
    use_synonyms = parameters.get('use_synonyms')
    log = self.response
    answer_kg = QGOrganizedKnowledgeGraph()
    edge_to_nodes_map = {}
    bte_inputs = self._get_valid_bte_inputs_dict()
    query_graph = eu.make_qg_use_old_types(query_graph)  # Temporary patch until KP is TRAPI 1.0 compliant

    # Check (and pre-process) the query so that BTE can handle it
    input_qnode_key, output_qnode_key = self._validate_and_pre_process_input(
        qg=query_graph,
        valid_bte_inputs_dict=bte_inputs,
        enforce_directionality=enforce_directionality,
        use_synonyms=use_synonyms,
        log=log)
    if log.status != 'OK':
        return answer_kg, edge_to_nodes_map
    input_qnode = query_graph.nodes[input_qnode_key]
    output_qnode = query_graph.nodes[output_qnode_key]

    # Hand the query to BTE
    answer_kg, accepted_curies = self._answer_query_using_bte(
        input_qnode_key=input_qnode_key,
        output_qnode_key=output_qnode_key,
        qg=query_graph,
        answer_kg=answer_kg,
        valid_bte_inputs_dict=bte_inputs,
        log=log)
    if log.status != 'OK':
        return answer_kg, edge_to_nodes_map

    # Hack to achieve a curie-to-curie query, if necessary
    if eu.qg_is_fulfilled(query_graph, answer_kg) and input_qnode.id and output_qnode.id:
        answer_kg = self._prune_answers_to_achieve_curie_to_curie_query(answer_kg, output_qnode_key, query_graph)

    # Wrap up: either finalize the answer or explain why there is none
    if eu.qg_is_fulfilled(query_graph, answer_kg):
        answer_kg = eu.switch_kg_to_arax_curie_format(answer_kg)
        edge_to_nodes_map = self._create_edge_to_nodes_map(answer_kg, input_qnode_key, output_qnode_key)
    elif not accepted_curies:
        log.warning(f"BTE could not accept any of the input curies. Valid curie prefixes for BTE are: "
                    f"{bte_inputs['curie_prefixes']}")

    return answer_kg, edge_to_nodes_map
def _answer_query_using_neo4j(self, qg: QueryGraph, kg_name: str, qedge_key: str, enforce_directionality: bool,
                              log: ARAXResponse) -> QGOrganizedKnowledgeGraph:
    """
    Answer a one-hop query graph via neo4j: build the cypher, run it, and load the results.

    :param qg: The one-hop query graph to answer.
    :param kg_name: Name of the KG whose neo4j instance is queried.
    :param qedge_key: Key of the query edge being answered.
    :param enforce_directionality: Passed through to the cypher query builder.
    :param log: Response object; its status is checked after the cypher-building and querying steps.
    :return: The loaded knowledge graph, or an empty one if the status was not 'OK' after either of
             those two steps.
    """
    empty_kg = QGOrganizedKnowledgeGraph()

    cypher_query = self._convert_one_hop_query_graph_to_cypher_query(qg, enforce_directionality, log)
    if log.status != 'OK':
        return empty_kg

    neo4j_results = self._send_query_to_neo4j(cypher_query, qedge_key, kg_name, log)
    if log.status != 'OK':
        return empty_kg

    # Whatever the loading step produced is returned, whether or not it logged an error
    return self._load_answers_into_kg(neo4j_results, kg_name, qg, log)
def answer_single_node_query(self, single_node_qg: QueryGraph) -> QGOrganizedKnowledgeGraph:
    """
    Answer a single-node query using Plover.

    :param single_node_qg: A query graph; only its first qnode is looked at.
    :return: A knowledge graph (organized by QG IDs) built from Plover's answer, or an empty one
             (with a "RequestFailed" error logged) if Plover did not respond with a 200.
    """
    log = self.response
    qnode_key = next(iter(single_node_qg.nodes))
    qnode = single_node_qg.nodes[qnode_key]
    empty_kg = QGOrganizedKnowledgeGraph()

    # Use canonical curies for the qnode, if it has any curies
    if qnode.ids:
        qnode.ids = eu.get_canonical_curies_list(qnode.ids, log)
        qnode.categories = None  # Important to clear this to avoid discrepancies in types for particular concepts

    # Ask Plover and load its answer only on success
    plover_answer, response_status = self._answer_query_using_plover(single_node_qg, log)
    if response_status != 200:
        log.error(f"Plover returned response of {response_status}. Answer was: {plover_answer}",
                  error_code="RequestFailed")
        return empty_kg
    return self._load_plover_answer_into_object_model(plover_answer, log)
def _prune_highly_connected_nodes(kg: QGOrganizedKnowledgeGraph, qedge_key: str, input_curies: Set[str], input_qnode_key: str, max_edges_per_input_curie: int, log: ARAXResponse) -> QGOrganizedKnowledgeGraph: # First create a lookup of which edges belong to which input curies input_nodes_to_edges_dict = defaultdict(set) for edge_key, edge in kg.edges_by_qg_id[qedge_key].items(): if edge.subject in input_curies: input_nodes_to_edges_dict[edge.subject].add(edge_key) if edge.object in input_curies: input_nodes_to_edges_dict[edge.object].add(edge_key) # Then prune down highly-connected nodes (delete edges per input curie in excess of some set limit) for node_key, connected_edge_keys in input_nodes_to_edges_dict.items(): connected_edge_keys_list = list(connected_edge_keys) if len(connected_edge_keys_list) > max_edges_per_input_curie: random.shuffle(connected_edge_keys_list) # Make it random which edges we keep for this input curie edge_keys_to_remove = connected_edge_keys_list[max_edges_per_input_curie:] log.debug(f"Randomly removing {len(edge_keys_to_remove)} edges from answer for input curie {node_key}") for edge_key in edge_keys_to_remove: kg.edges_by_qg_id[qedge_key].pop(edge_key, None) # Document that not all answers for this input curie are included node = kg.nodes_by_qg_id[input_qnode_key].get(node_key) if node: if not node.attributes: node.attributes = [] if not any(attribute.attribute_type_id == "biolink:incomplete_result_set" for attribute in node.attributes): node.attributes.append(Attribute(attribute_type_id="biolink:incomplete_result_set", # TODO: request this as actual biolink item? value_type_id="metatype:Boolean", value=True, attribute_source="infores:rtx-kg2", description=f"This attribute indicates that not all " f"nodes/edges returned as answers for this input " f"curie were included in the final answer due to " f"size limitations. 
{max_edges_per_input_curie} " f"edges for this input curie were kept.")) # Then delete any nodes orphaned by removal of edges node_keys_used_by_edges = kg.get_all_node_keys_used_by_edges() for qnode_key, nodes in kg.nodes_by_qg_id.items(): orphan_node_keys = set(nodes).difference(node_keys_used_by_edges) if orphan_node_keys: log.debug(f"Removing {len(orphan_node_keys)} {qnode_key} nodes orphaned by the above step") for orphan_node_key in orphan_node_keys: del kg.nodes_by_qg_id[qnode_key][orphan_node_key] return kg
def _load_answers_into_kg(self, neo4j_results: List[Dict[str, List[Dict[str, any]]]], kg_name: str,
                          qg: QueryGraph, log: ARAXResponse
                          ) -> Tuple[QGOrganizedKnowledgeGraph, Dict[str, Dict[str, str]]]:
    """
    Convert the first neo4j results table into a QG-organized knowledge graph plus an edge-to-nodes map.

    Columns whose names start with 'nodes' or 'edges' are loaded; the remainder of the column name
    (after the 'nodes_'/'edges_' prefix) is used as the qnode/qedge key. Other columns are ignored.

    :param neo4j_results: Results from neo4j; only the first item (a column name -> rows mapping) is read.
    :param kg_name: Name of the KG queried; a node uuid-to-curie lookup is built only for "KG1".
    :param qg: The query graph that was answered; its qnode keys are looked up on every neo4j edge.
    :param log: Response object used for logging.
    :return: A tuple of the populated knowledge graph and a dict mapping each edge key to a
             {qnode_key: value found under that qnode_key on the neo4j edge} dict.
    """
    first_qedge_key = next(iter(qg.edges))
    log.debug(f"Processing query results for edge {first_qedge_key}")
    final_kg = QGOrganizedKnowledgeGraph()
    edge_to_nodes_map = {}
    results_table = neo4j_results[0]
    if kg_name == "KG1":
        node_uuid_to_curie_dict = self._build_node_uuid_to_curie_dict(results_table)
    else:
        node_uuid_to_curie_dict = {}

    for column_name in list(results_table):
        if column_name.startswith('nodes'):  # Example column name: 'nodes_n00'
            fulfilled_qnode_key = column_name.replace("nodes_", "", 1)
            for neo4j_node in results_table.get(column_name):
                swagger_node_key, swagger_node = self._convert_neo4j_node_to_swagger_node(neo4j_node, kg_name)
                final_kg.add_node(swagger_node_key, swagger_node, fulfilled_qnode_key)
        elif column_name.startswith('edges'):  # Example column name: 'edges_e01'
            fulfilled_qedge_key = column_name.replace("edges_", "", 1)
            for neo4j_edge in results_table.get(column_name):
                swagger_edge_key, swagger_edge = self._convert_neo4j_edge_to_swagger_edge(
                    neo4j_edge, node_uuid_to_curie_dict, kg_name)

                # Note which of this edge's nodes fulfills which qnode_key
                nodes_for_this_edge = edge_to_nodes_map.setdefault(swagger_edge_key, {})
                for qnode_key in qg.nodes:
                    nodes_for_this_edge[qnode_key] = neo4j_edge.get(qnode_key)

                # Then file the edge itself in the answer knowledge graph
                final_kg.add_edge(swagger_edge_key, swagger_edge, fulfilled_qedge_key)

    return final_kg, edge_to_nodes_map
def answer_one_hop_query(self, query_graph: QueryGraph) -> QGOrganizedKnowledgeGraph:
    """
    This function answers a one-hop (single-edge) query using CHP client.

    Before delegating to `_answer_query_using_CHP_client`, it stores on `self`:
      - `CHP_survival_threshold`: the 'CHP_survival_threshold' response parameter as a float,
      - `allowable_gene_curies`: the keys of the client's 'biolink:Gene' curies,
      - `allowable_drug_curies`: the keys of the client's 'biolink:Drug' curies, with the
        'CHEMBL:' prefix rewritten to 'CHEMBL.COMPOUND:'.

    :param query_graph: A TRAPI query graph.
    :return: An (almost) TRAPI knowledge graph containing all of the nodes and edges returned as
             results for the query. (Organized by QG IDs.)
    :raises KeyError: If the 'CHP_survival_threshold' parameter or either curie category is missing.
    :raises ValueError: If the 'CHP_survival_threshold' parameter cannot be converted to a float.
    """
    # Set up the required parameters
    log = self.response
    self.CHP_survival_threshold = float(log.data['parameters']['CHP_survival_threshold'])
    allowable_curies = self.client.curies()
    self.allowable_gene_curies = list(allowable_curies['biolink:Gene'].keys())
    self.allowable_drug_curies = [curie_id.replace('CHEMBL:', 'CHEMBL.COMPOUND:')
                                  for curie_id in allowable_curies['biolink:Drug'].keys()]

    # The helper builds and returns the answer KG itself, so no placeholder KG is created here
    return self._answer_query_using_CHP_client(query_graph, log)
def _load_plover_answer_into_object_model(self, plover_answer: Dict[str, Dict[str, Union[set, dict]]],
                                          log: ARAXResponse) -> QGOrganizedKnowledgeGraph:
    """
    Convert a Plover answer into a QG-organized knowledge graph of TRAPI objects.

    :param plover_answer: Plover's answer; its "nodes" and "edges" entries are each read as a mapping of
                          qnode/qedge key -> {node/edge key: tuple to convert}.
    :param log: Response object used for debug logging (counts and load times).
    :return: The populated knowledge graph.
    """
    answer_kg = QGOrganizedKnowledgeGraph()

    # Nodes first, one qnode key at a time
    for qnode_key, node_tuples_by_key in plover_answer["nodes"].items():
        node_count = len(node_tuples_by_key)
        log.debug(f"Loading {node_count} {qnode_key} nodes into TRAPI object model")
        started_at = time.time()
        for node_key, node_tuple in node_tuples_by_key.items():
            answer_kg.add_node(node_key, self._convert_kg2c_plover_node_to_trapi_node(node_tuple), qnode_key)
        log.debug(f"Loading {node_count} {qnode_key} nodes into TRAPI object model took "
                  f"{round(time.time() - started_at, 2)} seconds")

    # Then edges, one qedge key at a time
    for qedge_key, edge_tuples_by_key in plover_answer["edges"].items():
        edge_count = len(edge_tuples_by_key)
        log.debug(f"Loading {edge_count} edges into TRAPI object model")
        started_at = time.time()
        for edge_key, edge_tuple in edge_tuples_by_key.items():
            answer_kg.add_edge(edge_key, self._convert_kg2c_plover_edge_to_trapi_edge(edge_tuple), qedge_key)
        log.debug(f"Loading {edge_count} {qedge_key} edges into TRAPI object model took "
                  f"{round(time.time() - started_at, 2)} seconds")

    return answer_kg
def answer_one_hop_query(self, query_graph: QueryGraph) -> QGOrganizedKnowledgeGraph:
    """
    This function answers a one-hop (single-edge) query using either KG1 or KG2.

    KG2c queries go to Plover first and fall back to neo4j if Plover does not respond with a 200;
    all other KGs are answered with neo4j directly.

    :param query_graph: A TRAPI query graph.
    :return: An (almost) TRAPI knowledge graph containing all of the nodes and edges returned as
             results for the query. (Organized by QG IDs.) It is empty if the query graph does not
             have exactly one edge and two nodes (an "InvalidQuery" error is logged in that case).
    """
    log = self.response
    enforce_directionality = self.enforce_directionality
    use_synonyms = self.use_synonyms
    kg_name = self.kg_name
    if kg_name == "KG1":
        query_graph = eu.make_qg_use_old_snake_case_types(query_graph)
    empty_kg = QGOrganizedKnowledgeGraph()

    # Reject anything that isn't a one-hop query graph
    if len(query_graph.edges) != 1:
        log.error(f"answer_one_hop_query() was passed a query graph that is not one-hop: "
                  f"{query_graph.to_dict()}", error_code="InvalidQuery")
        return empty_kg
    if len(query_graph.nodes) != 2:
        log.error(f"answer_one_hop_query() was passed a query graph with more than two nodes: "
                  f"{query_graph.to_dict()}", error_code="InvalidQuery")
        return empty_kg
    qedge_key = next(iter(query_graph.edges))

    # Consider any inverses of our predicate(s) as well
    query_graph = self._add_inverted_predicates(query_graph, log)

    # Swap qnode curies for synonyms (KG1) or canonical curies (KG2c) where applicable
    for qnode_key, qnode in query_graph.nodes.items():
        if not qnode.id:
            continue
        if use_synonyms and kg_name == "KG1":
            qnode.id = eu.get_curie_synonyms(qnode.id, log)
        elif kg_name == "KG2c":
            canonical_curies = eu.get_canonical_curies_list(qnode.id, log)
            log.debug(f"Using {len(canonical_curies)} curies as canonical curies for qnode {qnode_key}")
            qnode.id = canonical_curies
        # NOTE(review): reconstructed as applying to every qnode with curies (not only the KG2c
        # branch) - the flattened source is ambiguous here; confirm against the original file
        qnode.category = []  # Important to clear this, otherwise results are limited (#889)

    if kg_name != "KG2c":
        # Use Neo4j for KG2 and KG1 queries
        return self._answer_query_using_neo4j(query_graph, kg_name, qedge_key, enforce_directionality, log)

    # Use Plover to answer KG2c queries
    plover_answer, response_status = self._answer_query_using_plover(query_graph, log)
    if response_status == 200:
        return self._grab_nodes_and_edges_from_sqlite(plover_answer, kg_name, log)

    # Backup to using neo4j in the event plover failed
    log.warning(f"Plover returned a {response_status} response, so I'm backing up to Neo4j..")
    return self._answer_query_using_neo4j(query_graph, kg_name, qedge_key, enforce_directionality, log)
def answer_one_hop_query(self, query_graph: QueryGraph
                         ) -> Tuple[QGOrganizedKnowledgeGraph, Dict[str, Dict[str, str]]]:
    """
    Answer a one-hop (single-edge) query using NGD (with the assistance of KG2).

    KG2 is queried (through an ARAX query) for candidate node pairs, NGD is computed for each
    returned edge, and an NGD edge (plus its two nodes) is created for every pair whose NGD value
    is below a fixed threshold of 0.5.

    :param query_graph: A Reasoner API standard query graph.
    :return: A tuple of:
        1. an (almost) Reasoner API standard knowledge graph holding all nodes and edges returned as
           results for the query (dictionary version, organized by QG IDs), and
        2. a map of which nodes fulfilled which qnode_keys for each edge. Example:
           {'KG1:111221': {'n00': 'DOID:111', 'n01': 'HP:124'}, 'KG1:111223': {'n00': 'DOID:111', 'n01': 'HP:126'}}
        Both are empty if the query graph is invalid or the KG2 query leaves the status not 'OK'.
    """
    log = self.response
    final_kg = QGOrganizedKnowledgeGraph()
    edge_to_nodes_map = {}

    # Make sure we were handed a valid one-hop query graph
    self._verify_one_hop_query_graph_is_valid(query_graph, log)
    if log.status != 'OK':
        return final_kg, edge_to_nodes_map

    # Ask KG2 for potential answers
    log.debug("Finding potential answers using KG2")
    qedge_key = next(iter(query_graph.edges))
    qedge = query_graph.edges[qedge_key]
    source_qnode_key = qedge.subject
    target_qnode_key = qedge.object
    source_qnode = query_graph.nodes[source_qnode_key]
    target_qnode = query_graph.nodes[target_qnode_key]

    def _as_dsl_params(pieces):
        # Joins the non-empty DSL parameter snippets with commas
        return ", ".join(piece for piece in pieces if piece)

    qedge_params_str = _as_dsl_params([f"key={qedge_key}",
                                       f"subject={source_qnode_key}",
                                       f"object={target_qnode_key}",
                                       self._get_dsl_qedge_type_str(qedge)])
    source_params_str = _as_dsl_params([f"key={source_qnode_key}",
                                        self._get_dsl_qnode_curie_str(source_qnode),
                                        self._get_dsl_qnode_category_str(source_qnode)])
    target_params_str = _as_dsl_params([f"key={target_qnode_key}",
                                        self._get_dsl_qnode_curie_str(target_qnode),
                                        self._get_dsl_qnode_category_str(target_qnode)])
    actions_list = [
        f"add_qnode({source_params_str})",
        f"add_qnode({target_params_str})",
        f"add_qedge({qedge_params_str})",
        "expand(kp=ARAX/KG2)",
        "return(message=true, store=false)",
    ]
    _, kg2_message = self._run_arax_query(actions_list, log)
    if log.status != 'OK':
        return final_kg, edge_to_nodes_map

    # Compute NGD for the node pair of every edge KG2 returned
    log.debug("Calculating NGD between each potential node pair")
    kg2_answer_kg = kg2_message.knowledge_graph
    cngd = ComputeNGD(log, kg2_message, None)
    cngd.load_curie_to_pmids_data(kg2_answer_kg.nodes)
    ngd_info_by_kg2_edge = {}
    for kg2_edge_key, kg2_edge in kg2_answer_kg.edges.items():
        first_node_key = kg2_edge.subject
        second_node_key = kg2_edge.object
        first_node = kg2_answer_kg.nodes.get(first_node_key)  # These are already canonicalized (default behavior)
        second_node = kg2_answer_kg.nodes.get(second_node_key)
        # The KG2 edge may run either way (query was bidirectional), so orient the pair by qnode keys
        runs_source_to_target = (source_qnode_key in first_node.qnode_keys
                                 and target_qnode_key in second_node.qnode_keys)
        if runs_source_to_target:
            ngd_subject_key, ngd_object_key = first_node_key, second_node_key
        else:
            ngd_subject_key, ngd_object_key = second_node_key, first_node_key
        ngd_info_by_kg2_edge[kg2_edge_key] = {
            "ngd_value": cngd.calculate_ngd_fast(ngd_subject_key, ngd_object_key),
            "subject": ngd_subject_key,
            "object": ngd_object_key,
        }

    # Turn the pairs with a low enough NGD into edges (and nodes) of the final answer
    threshold = 0.5
    log.debug(f"Creating edges between node pairs with NGD below the threshold ({threshold})")
    for ngd_info in ngd_info_by_kg2_edge.values():
        ngd_value = ngd_info['ngd_value']
        if ngd_value is not None and ngd_value < threshold:  # TODO: Make determination of the threshold much more sophisticated
            ngd_edge_key, ngd_edge = self._create_ngd_edge(ngd_value, ngd_info["subject"], ngd_info["object"])
            ngd_source_node_key, ngd_source_node = self._create_ngd_node(
                ngd_edge.subject, kg2_answer_kg.nodes.get(ngd_edge.subject))
            ngd_target_node_key, ngd_target_node = self._create_ngd_node(
                ngd_edge.object, kg2_answer_kg.nodes.get(ngd_edge.object))
            final_kg.add_edge(ngd_edge_key, ngd_edge, qedge_key)
            final_kg.add_node(ngd_source_node_key, ngd_source_node, source_qnode_key)
            final_kg.add_node(ngd_target_node_key, ngd_target_node, target_qnode_key)
            edge_to_nodes_map[ngd_edge_key] = {
                source_qnode_key: ngd_source_node_key,
                target_qnode_key: ngd_target_node_key,
            }

    return final_kg, edge_to_nodes_map
def _answer_query_using_CHP_client(
        self, query_graph: QueryGraph,
        log: ARAXResponse) -> QGOrganizedKnowledgeGraph:
    """
    Answer a one-hop gene/drug query with the CHP client and load the results into a knowledge graph.

    Each end of the query edge either carries curie(s), which are screened by `self._check_id`, or
    has none, in which case the first category of that qnode must normalize to one of
    'drug'/'chemicalsubstance'/'gene'. Three cases are then handled:
      - both ends have curies: one CHP query per (source curie, target curie) pair;
      - only the source has curies: each source curie is paired with all allowable gene/drug curies
        whose categories (per `self.synonymizer`) include the target category, queried as a batch;
      - only the target has curies: the mirror image of the previous case.
    Every edge is created via `self._convert_to_swagger_edge` with the gene curie first, the drug
    curie second and the predicate "paired_with".

    Reads `self.allowable_gene_curies`, `self.allowable_drug_curies` and `self.CHP_survival_threshold`,
    which this method does not set itself.

    :param query_graph: The query graph; only its first edge and that edge's two qnodes are used.
    :param log: Response object used for debug/warning/error logging.
    :return: The answer knowledge graph (organized by QG IDs). It is returned empty when an error
             is logged ("BadEdge", "NotAllowable" or "CategoryError") or when `_check_id` reports
             an error.
    """
    qedge_key = next(qedge_key for qedge_key in query_graph.edges)
    log.debug(
        f"Processing query results for edge {qedge_key} by using CHP client"
    )
    final_kg = QGOrganizedKnowledgeGraph()
    # Categories are compared after stripping 'biolink:' and underscores and lower-casing (see below)
    gene_label_list = ['gene']
    drug_label_list = ['drug', 'chemicalsubstance']
    # use for checking the requirement
    source_pass_nodes = None
    source_category = None
    target_pass_nodes = None
    target_category = None

    qedge = query_graph.edges[qedge_key]
    source_qnode_key = qedge.subject
    target_qnode_key = qedge.object
    source_qnode = query_graph.nodes[source_qnode_key]
    target_qnode = query_graph.nodes[target_qnode_key]

    # check if both ends of edge have no curie
    if (source_qnode.id is None) and (target_qnode.id is None):
        log.error(f"Both ends of edge {qedge_key} are None",
                  error_code="BadEdge")
        return final_kg

    # check if the query nodes are drug or disease
    # NOTE(review): the checks below accept drug/chemical_substance or gene - "disease" in the
    # comment above looks stale; confirm
    if source_qnode.id is not None:

        # Provisional value only; it is replaced by the screened curies below, or we return early
        if type(source_qnode.id) is str:
            source_pass_nodes = [source_qnode.id]
        else:
            source_pass_nodes = source_qnode.id
        has_error, pass_nodes, not_pass_nodes = self._check_id(
            source_qnode.id, log)
        if has_error:
            return final_kg
        else:
            if len(not_pass_nodes) == 0 and len(pass_nodes) != 0:
                source_pass_nodes = pass_nodes
            elif len(not_pass_nodes) != 0 and len(pass_nodes) != 0:
                # Some curies were rejected: continue with the accepted ones and warn about the rest
                source_pass_nodes = pass_nodes
                if len(not_pass_nodes) == 1:
                    log.warning(
                        f"The curie id of {not_pass_nodes[0]} is not allowable based on CHP client"
                    )
                else:
                    log.warning(
                        f"The curie ids of these nodes {not_pass_nodes} are not allowable based on CHP client"
                    )
            else:
                # No curie was accepted
                if type(source_qnode.id) is str:
                    log.error(
                        f"The curie id of {source_qnode.id} is not allowable based on CHP client",
                        error_code="NotAllowable")
                    return final_kg
                else:
                    log.error(
                        f"The curie ids of {source_qnode.id} are not allowable based on CHP client",
                        error_code="NotAllowable")
                    return final_kg
    else:
        # No curie on the source: its first category decides whether it stands for drugs or genes
        category = source_qnode.category[0].replace(
            'biolink:', '').replace('_', '').lower()
        source_category = category
        if (category in drug_label_list) or (category in gene_label_list):
            source_category = category
        else:
            log.error(
                f"The category of query node {source_qnode_key} is unsatisfiable. It has to be drug/chemical_substance or gene",
                error_code="CategoryError")
            return final_kg

    if target_qnode.id is not None:

        # Provisional value only; it is replaced by the screened curies below, or we return early
        if type(target_qnode.id) is str:
            target_pass_nodes = [target_qnode.id]
        else:
            target_pass_nodes = target_qnode.id
        has_error, pass_nodes, not_pass_nodes = self._check_id(
            target_qnode.id, log)
        if has_error:
            return final_kg
        else:
            if len(not_pass_nodes) == 0 and len(pass_nodes) != 0:
                target_pass_nodes = pass_nodes
            elif len(not_pass_nodes) != 0 and len(pass_nodes) != 0:
                # Some curies were rejected: continue with the accepted ones and warn about the rest
                target_pass_nodes = pass_nodes
                if len(not_pass_nodes) == 1:
                    log.warning(
                        f"The curie id of {not_pass_nodes[0]} is not allowable based on CHP client"
                    )
                else:
                    log.warning(
                        f"The curie ids of these nodes {not_pass_nodes} are not allowable based on CHP client"
                    )
            else:
                # No curie was accepted
                # NOTE(review): the equivalent source-side errors use error_code="NotAllowable";
                # "CategoryError" here may be a copy-paste slip - confirm
                if type(target_qnode.id) is str:
                    log.error(
                        f"The curie id of {target_qnode.id} is not allowable based on CHP client",
                        error_code="CategoryError")
                    return final_kg
                else:
                    log.error(
                        f"The curie ids of {target_qnode.id} are not allowable based on CHP client",
                        error_code="CategoryError")
                    return final_kg
    else:
        # No curie on the target: its first category decides whether it stands for drugs or genes
        category = target_qnode.category[0].replace(
            'biolink:', '').replace('_', '').lower()
        target_category = category
        if (category in drug_label_list) or (category in gene_label_list):
            target_category = category
        else:
            log.error(
                f"The category of query node {target_qnode_key} is unsatisfiable. It has to be drug/chemical_substance or gene",
                error_code="CategoryError")
            return final_kg

    if (source_pass_nodes is None) and (target_pass_nodes is None):
        return final_kg

    elif (source_pass_nodes is not None) and (target_pass_nodes
                                               is not None):
        # Case 1: both ends have accepted curies
        source_dict = dict()
        target_dict = dict()
        # The type of each end is decided from its FIRST curie only; anything that is not an
        # allowable drug curie is treated as a gene
        if source_pass_nodes[0] in self.allowable_drug_curies:
            source_category_temp = 'drug'
        else:
            source_category_temp = 'gene'
        if target_pass_nodes[0] in self.allowable_drug_curies:
            target_category_temp = 'drug'
        else:
            target_category_temp = 'gene'
        if source_category_temp == target_category_temp:
            log.error(
                f"The query nodes in both ends of edge are the same type which is {source_category_temp}",
                error_code="CategoryError")
            return final_kg
        else:
            for (source_curie, target_curie) in itertools.product(
                    source_pass_nodes, target_pass_nodes):

                if source_category_temp == 'drug':
                    # The CHP query is given the drug with a 'CHEMBL:' prefix
                    source_curie_temp = source_curie.replace(
                        'CHEMBL.COMPOUND:', 'CHEMBL:')
                    # Let's build a simple single query
                    q = build_query(genes=[target_curie],
                                    therapeutic=source_curie_temp,
                                    disease='MONDO:0007254',
                                    outcome=('EFO:0000714', '>=',
                                             self.CHP_survival_threshold))

                    response = self.client.query(q)
                    max_probability = self.client.get_outcome_prob(
                        response)
                    swagger_edge_key, swagger_edge = self._convert_to_swagger_edge(
                        target_curie, source_curie, "paired_with",
                        max_probability)
                else:
                    # The CHP query is given the drug with a 'CHEMBL:' prefix
                    target_curie_temp = target_curie.replace(
                        'CHEMBL.COMPOUND:', 'CHEMBL:')
                    # Let's build a simple single query
                    q = build_query(genes=[source_curie],
                                    therapeutic=target_curie_temp,
                                    disease='MONDO:0007254',
                                    outcome=('EFO:0000714', '>=',
                                             self.CHP_survival_threshold))

                    response = self.client.query(q)
                    max_probability = self.client.get_outcome_prob(
                        response)
                    swagger_edge_key, swagger_edge = self._convert_to_swagger_edge(
                        source_curie, target_curie, "paired_with",
                        max_probability)

                # Remember which qnode each curie fulfills so the nodes can be added afterwards
                source_dict[source_curie] = source_qnode_key
                target_dict[target_curie] = target_qnode_key

                # Finally add the current edge to our answer knowledge graph
                final_kg.add_edge(swagger_edge_key, swagger_edge, qedge_key)

            # Add the nodes to our answer knowledge graph
            if len(source_dict) != 0:
                for source_curie in source_dict:
                    swagger_node_key, swagger_node = self._convert_to_swagger_node(
                        source_curie)
                    final_kg.add_node(swagger_node_key, swagger_node,
                                      source_dict[source_curie])
            if len(target_dict) != 0:
                for target_curie in target_dict:
                    swagger_node_key, swagger_node = self._convert_to_swagger_node(
                        target_curie)
                    final_kg.add_node(swagger_node_key, swagger_node,
                                      target_dict[target_curie])

            return final_kg

    elif source_pass_nodes is not None:
        # Case 2: only the source has curies; the target is described by its category
        source_dict = dict()
        target_dict = dict()

        if source_pass_nodes[0] in self.allowable_drug_curies:
            source_category_temp = 'drug'
        else:
            source_category_temp = 'gene'
        if target_category in drug_label_list:
            target_category_temp = 'drug'
        else:
            target_category_temp = 'gene'
        if source_category_temp == target_category_temp:
            log.error(
                f"The query nodes in both ends of edge are the same type which is {source_category_temp}",
                error_code="CategoryError")
            return final_kg
        else:
            if source_category_temp == 'drug':
                for source_curie in source_pass_nodes:

                    # Candidate genes: allowable gene curies known to the synonymizer whose
                    # normalized categories include the target category (rebuilt per source curie)
                    genes = [
                        curie for curie in self.allowable_gene_curies
                        if self.synonymizer.get_canonical_curies(curie)
                        [curie] is not None and target_category in [
                            category.replace('biolink:', '').replace(
                                '_', '').lower() for category in list(
                                    self.synonymizer.get_canonical_curies(
                                        curie, return_all_categories=True)
                                    [curie]['all_categories'].keys())
                        ]
                    ]
                    therapeutic = source_curie.replace(
                        'CHEMBL.COMPOUND:', 'CHEMBL:')
                    disease = 'MONDO:0007254'
                    outcome = ('EFO:0000714', '>=',
                               self.CHP_survival_threshold)

                    queries = []
                    for gene in genes:
                        queries.append(
                            build_query(
                                genes=[gene],
                                therapeutic=therapeutic,
                                disease=disease,
                                outcome=outcome,
                            ))

                    # use the query_all endpoint to run the batch of queries
                    res = self.client.query_all(queries)

                    # Results are paired with genes by position
                    # assumes res["message"] is in the same order as `queries` - TODO confirm
                    for result, gene in zip(res["message"], genes):
                        prob = self.client.get_outcome_prob(result)
                        swagger_edge_key, swagger_edge = self._convert_to_swagger_edge(
                            gene, source_curie, "paired_with", prob)

                        source_dict[source_curie] = source_qnode_key
                        target_dict[gene] = target_qnode_key

                        # Finally add the current edge to our answer knowledge graph
                        final_kg.add_edge(swagger_edge_key, swagger_edge,
                                          qedge_key)
            else:
                for source_curie in source_pass_nodes:

                    genes = [source_curie]
                    # Candidate drugs: allowable drug curies (looked up in the synonymizer in their
                    # 'CHEMBL.COMPOUND:' form) whose normalized categories include the target
                    # category; they are kept here in 'CHEMBL:' form for the CHP queries
                    therapeutic = [
                        curie.replace('CHEMBL.COMPOUND:', 'CHEMBL:')
                        for curie in self.allowable_drug_curies
                        if self.synonymizer.get_canonical_curies(
                            curie.replace('CHEMBL:', 'CHEMBL.COMPOUND:'))
                        [curie.replace('CHEMBL:', 'CHEMBL.COMPOUND:')]
                        is not None and target_category in [
                            category.replace('biolink:', '').replace(
                                '_', '').lower() for category in list(
                                    self.synonymizer.get_canonical_curies(
                                        curie.replace('CHEMBL:',
                                                      'CHEMBL.COMPOUND:'),
                                        return_all_categories=True)[
                                            curie.replace(
                                                'CHEMBL:',
                                                'CHEMBL.COMPOUND:')]
                                    ['all_categories'].keys())
                        ]
                    ]
                    disease = 'MONDO:0007254'
                    outcome = ('EFO:0000714', '>=',
                               self.CHP_survival_threshold)

                    queries = []
                    for drug in therapeutic:
                        queries.append(
                            build_query(
                                genes=genes,
                                therapeutic=drug,
                                disease=disease,
                                outcome=outcome,
                            ))

                    # use the query_all endpoint to run the batch of queries
                    res = self.client.query_all(queries)

                    # Results are paired with drugs by position
                    # assumes res["message"] is in the same order as `queries` - TODO confirm
                    for result, drug in zip(res["message"], therapeutic):
                        # Back to the 'CHEMBL.COMPOUND:' form for the answer knowledge graph
                        drug = drug.replace('CHEMBL:', 'CHEMBL.COMPOUND:')
                        prob = self.client.get_outcome_prob(result)
                        swagger_edge_key, swagger_edge = self._convert_to_swagger_edge(
                            source_curie, drug, "paired_with", prob)

                        source_dict[source_curie] = source_qnode_key
                        target_dict[drug] = target_qnode_key

                        # Finally add the current edge to our answer knowledge graph
                        final_kg.add_edge(swagger_edge_key, swagger_edge,
                                          qedge_key)

            # Add the nodes to our answer knowledge graph
            if len(source_dict) != 0:
                for source_curie in source_dict:
                    swagger_node_key, swagger_node = self._convert_to_swagger_node(
                        source_curie)
                    final_kg.add_node(swagger_node_key, swagger_node,
                                      source_dict[source_curie])
            if len(target_dict) != 0:
                for target_curie in target_dict:
                    swagger_node_key, swagger_node = self._convert_to_swagger_node(
                        target_curie)
                    final_kg.add_node(swagger_node_key, swagger_node,
                                      target_dict[target_curie])

            return final_kg
    else:
        # Case 3: only the target has curies; the source is described by its category
        source_dict = dict()
        target_dict = dict()

        if target_pass_nodes[0] in self.allowable_drug_curies:
            target_category_temp = 'drug'
        else:
            target_category_temp = 'gene'
        if source_category in drug_label_list:
            source_category_temp = 'drug'
        else:
            source_category_temp = 'gene'
        if source_category_temp == target_category_temp:
            log.error(
                f"The query nodes in both ends of edge are the same type which is {source_category_temp}",
                error_code="CategoryError")
            return final_kg
        else:
            if target_category_temp == 'drug':
                for target_curie in target_pass_nodes:

                    # Candidate genes: allowable gene curies known to the synonymizer whose
                    # normalized categories include the source category (rebuilt per target curie)
                    genes = [
                        curie for curie in self.allowable_gene_curies
                        if self.synonymizer.get_canonical_curies(curie)
                        [curie] is not None and source_category in [
                            category.replace('biolink:', '').replace(
                                '_', '').lower() for category in list(
                                    self.synonymizer.get_canonical_curies(
                                        curie, return_all_categories=True)
                                    [curie]['all_categories'].keys())
                        ]
                    ]
                    therapeutic = target_curie.replace(
                        'CHEMBL.COMPOUND:', 'CHEMBL:')
                    disease = 'MONDO:0007254'
                    outcome = ('EFO:0000714', '>=',
                               self.CHP_survival_threshold)

                    queries = []
                    for gene in genes:
                        queries.append(
                            build_query(
                                genes=[gene],
                                therapeutic=therapeutic,
                                disease=disease,
                                outcome=outcome,
                            ))

                    # use the query_all endpoint to run the batch of queries
                    res = self.client.query_all(queries)

                    # Results are paired with genes by position
                    # assumes res["message"] is in the same order as `queries` - TODO confirm
                    for result, gene in zip(res["message"], genes):
                        prob = self.client.get_outcome_prob(result)
                        swagger_edge_key, swagger_edge = self._convert_to_swagger_edge(
                            gene, target_curie, "paired_with", prob)

                        source_dict[gene] = source_qnode_key
                        target_dict[target_curie] = target_qnode_key

                        # Finally add the current edge to our answer knowledge graph
                        final_kg.add_edge(swagger_edge_key, swagger_edge,
                                          qedge_key)

            else:
                for target_curie in target_pass_nodes:

                    genes = [target_curie]
                    # Candidate drugs: allowable drug curies (looked up in the synonymizer in their
                    # 'CHEMBL.COMPOUND:' form) whose normalized categories include the source
                    # category; they are kept here in 'CHEMBL:' form for the CHP queries
                    therapeutic = [
                        curie.replace('CHEMBL.COMPOUND:', 'CHEMBL:')
                        for curie in self.allowable_drug_curies
                        if self.synonymizer.get_canonical_curies(
                            curie.replace('CHEMBL:', 'CHEMBL.COMPOUND:'))
                        [curie.replace('CHEMBL:', 'CHEMBL.COMPOUND:')]
                        is not None and source_category in [
                            category.replace('biolink:', '').replace(
                                '_', '').lower() for category in list(
                                    self.synonymizer.get_canonical_curies(
                                        curie.replace('CHEMBL:',
                                                      'CHEMBL.COMPOUND:'),
                                        return_all_categories=True)[
                                            curie.replace(
                                                'CHEMBL:',
                                                'CHEMBL.COMPOUND:')]
                                    ['all_categories'].keys())
                        ]
                    ]
                    disease = 'MONDO:0007254'
                    outcome = ('EFO:0000714', '>=',
                               self.CHP_survival_threshold)

                    queries = []
                    for drug in therapeutic:
                        queries.append(
                            build_query(
                                genes=genes,
                                therapeutic=drug,
                                disease=disease,
                                outcome=outcome,
                            ))

                    # use the query_all endpoint to run the batch of queries
                    res = self.client.query_all(queries)

                    # Results are paired with drugs by position
                    # assumes res["message"] is in the same order as `queries` - TODO confirm
                    for result, drug in zip(res["message"], therapeutic):
                        # Back to the 'CHEMBL.COMPOUND:' form for the answer knowledge graph
                        drug = drug.replace('CHEMBL:', 'CHEMBL.COMPOUND:')
                        prob = self.client.get_outcome_prob(result)
                        swagger_edge_key, swagger_edge = self._convert_to_swagger_edge(
                            target_curie, drug, "paired_with", prob)

                        source_dict[drug] = source_qnode_key
                        target_dict[target_curie] = target_qnode_key

                        # Finally add the current edge to our answer knowledge graph
                        final_kg.add_edge(swagger_edge_key, swagger_edge,
                                          qedge_key)

            # Add the nodes to our answer knowledge graph
            if len(source_dict) != 0:
                for source_curie in source_dict:
                    swagger_node_key, swagger_node = self._convert_to_swagger_node(
                        source_curie)
                    final_kg.add_node(swagger_node_key, swagger_node,
                                      source_dict[source_curie])
            if len(target_dict) != 0:
                for target_curie in target_dict:
                    swagger_node_key, swagger_node = self._convert_to_swagger_node(
                        target_curie)
                    final_kg.add_node(swagger_node_key, swagger_node,
                                      target_dict[target_curie])

            return final_kg
def answer_one_hop_query(
        self, query_graph: QueryGraph
) -> Tuple[QGOrganizedKnowledgeGraph, Dict[str, Dict[str, str]]]:
    """
    Answer a one-hop (single-edge) query by converting it to cypher and running it through neo4j.

    Which knowledge graph is queried is controlled by self.kg_name ("KG1" and "KG2c" get special
    curie handling below).

    :param query_graph: A Reasoner API standard query graph; must contain exactly one edge and two nodes.
    :return: A tuple containing:
        1. an (almost) Reasoner API standard knowledge graph holding the nodes and edges returned
           for the query (organized by QG IDs).
        2. a map of which nodes fulfilled which qnode_keys for each edge, e.g.:
           {'KG1:111221': {'n00': 'DOID:111', 'n01': 'HP:124'},
            'KG1:111223': {'n00': 'DOID:111', 'n01': 'HP:126'}}
        If validation or any later step logs an error, whatever has been built so far is returned
        (an empty graph and empty map for the early failures).
    """
    log = self.response
    enforce_directionality = self.enforce_directionality
    use_synonyms = self.use_synonyms
    kg_name = self.kg_name
    answer_kg = QGOrganizedKnowledgeGraph()
    nodes_used_by_edge = dict()
    # Temporary patch until we switch to KG2.5.1
    query_graph = eu.make_qg_use_old_types(query_graph)

    # Reject anything that is not exactly one edge joining two nodes
    if len(query_graph.edges) != 1:
        log.error(
            f"answer_one_hop_query() was passed a query graph that is not one-hop: "
            f"{query_graph.to_dict()}",
            error_code="InvalidQuery")
        return answer_kg, nodes_used_by_edge
    if len(query_graph.nodes) != 2:
        log.error(
            f"answer_one_hop_query() was passed a query graph with more than two nodes: "
            f"{query_graph.to_dict()}",
            error_code="InvalidQuery")
        return answer_kg, nodes_used_by_edge
    qedge_key = next(iter(query_graph.edges))

    # For every qnode that carries curies, swap them for synonyms (KG1) or canonical curies (KG2c)
    for qnode_key, qnode in query_graph.nodes.items():
        if not qnode.id:
            continue
        if kg_name == "KG1" and use_synonyms:
            qnode.id = eu.get_curie_synonyms(qnode.id, log)
        elif kg_name == "KG2c":
            canonical_curies = eu.get_canonical_curies_list(qnode.id, log)
            log.debug(
                f"Using {len(canonical_curies)} curies as canonical curies for qnode {qnode_key}"
            )
            qnode.id = canonical_curies
        # Important to clear this, otherwise results are limited (#889)
        qnode.category = []

    # Build the cypher, run it, and load the rows into the answer KG; stop at the first logged error
    cypher_query = self._convert_one_hop_query_graph_to_cypher_query(
        query_graph, enforce_directionality, kg_name, log)
    if log.status != 'OK':
        return answer_kg, nodes_used_by_edge
    neo4j_results = self._answer_query_using_neo4j(cypher_query, qedge_key, kg_name, log)
    if log.status != 'OK':
        return answer_kg, nodes_used_by_edge
    answer_kg, nodes_used_by_edge = self._load_answers_into_kg(
        neo4j_results, kg_name, query_graph, log)

    if log.status == 'OK':
        # TODO: remove this patch once we switch to KG2.5.0!
        eu.convert_node_and_edge_types_to_new_format(answer_kg)
    return answer_kg, nodes_used_by_edge
def answer_one_hop_query(self, query_graph: QueryGraph) -> QGOrganizedKnowledgeGraph:
    """
    Answer a one-hop (single-edge) query using KG2c, via PloverDB.

    The input qnode's curies are canonicalized, split into batches of self.curie_batch_size, and
    each batch is sent to Plover in turn; the per-batch answers are merged into one knowledge graph.
    Note that the passed-in query graph is modified in place (curies replaced, categories cleared
    on qnodes that have curies, and the input qnode's ids overwritten with each batch).

    :param query_graph: A TRAPI query graph; must contain exactly one edge and two nodes.
    :return: An (almost) TRAPI knowledge graph containing the nodes and edges returned as results
             for the query (organized by QG IDs). On an error, the graph built so far is returned
             after the error has been logged.
    """
    log = self.response
    final_kg = QGOrganizedKnowledgeGraph()

    # Reject anything that is not exactly one edge joining two nodes
    if len(query_graph.edges) != 1:
        log.error(f"answer_one_hop_query() was passed a query graph that is not one-hop: "
                  f"{query_graph.to_dict()}", error_code="InvalidQuery")
        return final_kg
    if len(query_graph.nodes) != 2:
        log.error(f"answer_one_hop_query() was passed a query graph with more than two nodes: "
                  f"{query_graph.to_dict()}", error_code="InvalidQuery")
        return final_kg

    # Swap the curies on each qnode that has them for their canonical versions
    for qnode_key, qnode in query_graph.nodes.items():
        if not qnode.ids:
            continue
        canonical_curies = eu.get_canonical_curies_list(qnode.ids, log)
        log.debug(f"Using {len(canonical_curies)} curies as canonical curies for qnode {qnode_key}")
        qnode.ids = canonical_curies
        # Important to clear this, otherwise results are limited (#889)
        qnode.categories = None

    # Work out the batches of input curies to send to Plover
    qedge_key = next(iter(query_graph.edges))
    input_qnode_key = self._get_input_qnode_key(query_graph)
    input_curies = query_graph.nodes[input_qnode_key].ids
    input_curie_set = set(input_curies)
    curie_batches = [input_curies[offset:offset + self.curie_batch_size]
                     for offset in range(0, len(input_curies), self.curie_batch_size)]
    log.debug(f"Split {len(input_curies)} input curies into {len(curie_batches)} batches to send to Plover")
    log.info(f"Max edges allowed per input curie for this query is: {self.max_edges_per_input_curie}")

    for batch_num, curie_batch in enumerate(curie_batches, start=1):
        log.debug(f"Sending batch {batch_num} to Plover (has {len(curie_batch)} input curies)")
        query_graph.nodes[input_qnode_key].ids = curie_batch
        plover_answer, response_status = self._answer_query_using_plover(query_graph, log)
        if response_status != 200:
            log.error(f"Plover returned response of {response_status}. Answer was: {plover_answer}",
                      error_code="RequestFailed")
            return final_kg

        batch_kg = self._load_plover_answer_into_object_model(plover_answer, log)
        final_kg = eu.merge_two_kgs(batch_kg, final_kg)

        # Nothing to prune unless this qedge has edges and they exceed the allowed maximum
        if not final_kg.edges_by_qg_id.get(qedge_key):
            continue
        if len(final_kg.edges_by_qg_id[qedge_key]) <= self.max_allowed_edges:
            continue

        # Prune down highly-connected input curies since we're over the max number of allowed edges
        log.debug(f"Have exceeded max num allowed edges ({self.max_allowed_edges}); will attempt to "
                  f"reduce the number of edges by pruning down highly connected nodes")
        final_kg = self._prune_highly_connected_nodes(final_kg, qedge_key, input_curie_set,
                                                      input_qnode_key, self.max_edges_per_input_curie,
                                                      log)
        # Error out if this pruning wasn't sufficient to bring down the edge count
        if len(final_kg.edges_by_qg_id[qedge_key]) > self.max_allowed_edges:
            log.error(f"Query for qedge {qedge_key} produced more than {self.max_allowed_edges} edges, "
                      f"which is too much for the system to handle. You must somehow make your query "
                      f"smaller (specify fewer input curies or use more specific predicates/categories).",
                      error_code="QueryTooLarge")
            return final_kg

    return final_kg
def answer_one_hop_query(
        self, query_graph: QueryGraph) -> QGOrganizedKnowledgeGraph:
    """
    Answer a one-hop (single-edge) query using NGD (with the assistance of KG2).

    Candidate node pairs are first found by running the query graph (with its qedge predicates
    removed) as an ARAX query; an NGD value is then computed for each returned edge's node pair,
    and an NGD edge is created for each pair whose value falls below a fixed threshold.

    :param query_graph: A TRAPI query graph.
    :return: An (almost) TRAPI knowledge graph containing the NGD edges and their nodes
             (organized by QG IDs). Returned empty if an error is logged along the way.
    """
    log = self.response
    final_kg = QGOrganizedKnowledgeGraph()

    # Bail out early on an invalid query graph or a predicate NGD can't handle
    self._verify_one_hop_query_graph_is_valid(query_graph, log)
    if log.status != 'OK':
        return final_kg
    qedge_key = next(iter(query_graph.edges))
    qedge = query_graph.edges[qedge_key]
    if qedge.predicates and not set(qedge.predicates).intersection(
            self.accepted_qedge_predicates):
        log.error(
            f"NGD can only expand qedges with these predicates: {self.accepted_qedge_predicates}. QEdge"
            f" {qedge_key}'s predicate is: {qedge.predicates}",
            error_code="UnsupportedQG")
        return final_kg
    source_qnode_key = qedge.subject
    target_qnode_key = qedge.object

    # Ask KG2 for candidate answers, using a copy of the QG with all predicates stripped
    log.debug(f"Finding potential answers using KG2")
    modified_qg = copy.deepcopy(query_graph)
    for stripped_qedge in modified_qg.edges.values():
        stripped_qedge.predicates = None
    request_body = {"message": {"query_graph": modified_qg.to_dict()}}
    kg2_response, kg2_message = self._run_arax_query(request_body, log)
    if log.status != 'OK':
        return final_kg

    # First pass: compute an NGD value for the node pair of every edge KG2 returned
    log.debug(f"Calculating NGD between each potential node pair")
    kg2_answer_kg = kg2_message.knowledge_graph
    cngd = ComputeNGD(log, kg2_message, None)
    cngd.load_curie_to_pmids_data(kg2_answer_kg.nodes)
    kg2_edge_ngd_map = dict()
    for kg2_edge_key, kg2_edge in kg2_answer_kg.edges.items():
        first_key, second_key = kg2_edge.subject, kg2_edge.object
        # These are already canonicalized (default behavior)
        first_node = kg2_answer_kg.nodes.get(first_key)
        second_node = kg2_answer_kg.nodes.get(second_key)
        # Figure out which node corresponds to source qnode (don't necessarily match b/c query was bidirectional)
        same_orientation = (source_qnode_key in first_node.qnode_keys
                            and target_qnode_key in second_node.qnode_keys)
        if same_orientation:
            ngd_subject, ngd_object = first_key, second_key
        else:
            ngd_subject, ngd_object = second_key, first_key
        ngd_value, pmid_set = cngd.calculate_ngd_fast(ngd_subject, ngd_object)
        kg2_edge_ngd_map[kg2_edge_key] = {
            "ngd_value": ngd_value,
            "subject": ngd_subject,
            "object": ngd_object,
            "pmids": [f"PMID:{pmid}" for pmid in pmid_set],
        }

    # Second pass: create NGD edges (and their nodes) for the pairs under the threshold
    threshold = 0.5
    log.debug(
        f"Creating edges between node pairs with NGD below the threshold ({threshold})"
    )
    for ngd_info_dict in kg2_edge_ngd_map.values():
        ngd_value = ngd_info_dict['ngd_value']
        # TODO: Make determination of the threshold much more sophisticated
        if ngd_value is None or ngd_value >= threshold:
            continue
        ngd_edge_key, ngd_edge = self._create_ngd_edge(
            ngd_value, ngd_info_dict["subject"], ngd_info_dict["object"], ngd_info_dict["pmids"])
        ngd_source_node_key, ngd_source_node = self._create_ngd_node(
            ngd_edge.subject, kg2_answer_kg.nodes.get(ngd_edge.subject))
        ngd_target_node_key, ngd_target_node = self._create_ngd_node(
            ngd_edge.object, kg2_answer_kg.nodes.get(ngd_edge.object))
        final_kg.add_edge(ngd_edge_key, ngd_edge, qedge_key)
        final_kg.add_node(ngd_source_node_key, ngd_source_node, source_qnode_key)
        final_kg.add_node(ngd_target_node_key, ngd_target_node, target_qnode_key)

    return final_kg
def answer_one_hop_query(
        self, query_graph: QueryGraph
) -> Tuple[QGOrganizedKnowledgeGraph, Dict[str, Dict[str, str]]]:
    """
    Answer a one-hop (single-edge) query using the Genetics Provider.

    :param query_graph: A Reasoner API standard query graph.
    :return: A tuple containing:
        1. an (almost) Reasoner API standard knowledge graph holding the nodes and edges returned
           for the query (organized by QG IDs).
        2. a map of which nodes fulfilled which qnode_keys for each edge, e.g.:
           {'KG1:111221': {'n00': 'DOID:111', 'n01': 'HP:124'},
            'KG1:111223': {'n00': 'DOID:111', 'n01': 'HP:126'}}
        Both are returned empty if validation/pre-processing logs an error or the KP response
        contains no knowledge graph.
    """
    log = self.response
    include_all_scores = self.response.data['parameters']['include_all_scores']
    final_kg = QGOrganizedKnowledgeGraph()
    edge_to_nodes_map = dict()
    # Temporary patch until TRAPI 1.0 KP endpoint is ready
    query_graph = eu.make_qg_use_old_types(query_graph)

    # Validate the one-hop query graph, then adapt it for this KP
    self._verify_one_hop_query_graph_is_valid(query_graph, log)
    if log.status != 'OK':
        return final_kg, edge_to_nodes_map
    modified_query_graph = self._pre_process_query_graph(query_graph, log)
    if log.status != 'OK':
        return final_kg, edge_to_nodes_map
    qedge = next(iter(modified_query_graph.edges.values()))
    source_qnode_key = qedge.subject
    target_qnode_key = qedge.object

    # Send the query to the KP; without a KG in the response there is nothing to load
    json_response = self._send_query_to_kp(modified_query_graph, log)
    returned_kg = json_response.get('knowledge_graph')
    if not returned_kg:
        log.warning(f"No KG is present in the response from {self.kp_name}")
        return final_kg, edge_to_nodes_map

    # Map of KP node/edge IDs to the qnode/qedge keys they fulfil
    qg_id_mappings = self._get_qg_id_mappings_from_results(json_response['results'])
    unknown_scores_encountered = set()

    # Load the returned edges into our final KG
    for returned_edge in returned_kg['edges']:
        # Skip edges missing a source and/or target ID (have encountered these before)
        if not returned_edge['source_id'] or not returned_edge['target_id']:
            log.warning(
                f"Edge returned from GeneticsKP is lacking a subject and/or object: {returned_edge}."
                f" Will skip adding this edge to the KG.")
            continue

        score_name = returned_edge['score_name']
        if score_name not in self.score_type_lookup:
            unknown_scores_encountered.add(score_name)

        # Edges whose score is the magma score are always kept; edges with any other score are
        # kept only when include_all_scores is set.
        # NOTE(review): confirm this is the intended direction of the filter.
        if not (include_all_scores or score_name == self.magma_score_name):
            continue
        kp_edge_key, swagger_edge = self._create_swagger_edge_from_kp_edge(returned_edge)
        # Convert to an ID that's unique for us
        swagger_edge_key = self._create_unique_edge_key(swagger_edge)
        for qedge_key in qg_id_mappings['edges'][kp_edge_key]:
            final_kg.add_edge(swagger_edge_key, swagger_edge, qedge_key)
        edge_to_nodes_map[swagger_edge_key] = {
            source_qnode_key: swagger_edge.subject,
            target_qnode_key: swagger_edge.object,
        }

    if unknown_scores_encountered:
        log.warning(
            f"Encountered unknown score(s) from {self.kp_name}: {unknown_scores_encountered}. "
            f"Not sure what data type to assign these.")

    # Load the returned nodes into our final KG
    for returned_node in returned_kg['nodes']:
        # Skip any nodes with 'None' for their ID (see discussion in #1154)
        if not returned_node['id']:
            log.warning(
                f"Node returned from {self.kp_name} is lacking an ID: {returned_node}."
                f" Will skip adding this node to the KG.")
            continue
        swagger_node_key, swagger_node = self._create_swagger_node_from_kp_node(returned_node)
        for qnode_key in qg_id_mappings['nodes'][swagger_node_key]:
            final_kg.add_node(swagger_node_key, swagger_node, qnode_key)

    return final_kg, edge_to_nodes_map
def _grab_nodes_and_edges_from_sqlite(
        self, plover_answer: Dict[str, Dict[str, Set[Union[str, int]]]],
        kg_name: str, log: ARAXResponse) -> QGOrganizedKnowledgeGraph:
    """
    Look up the full node/edge records for the IDs in a Plover answer in the local KG2c sqlite
    database and load them into a QG-organized knowledge graph.

    :param plover_answer: Dict with "nodes" and "edges" entries, each mapping a qnode/qedge key to
                          the collection of node/edge IDs that fulfil it.
    :param kg_name: Name of the knowledge graph; passed through to the node/edge converters.
    :param log: Response object used for debug logging.
    :return: A knowledge graph (organized by QG IDs) holding every node and edge found in sqlite
             for the given IDs. IDs with no matching row are silently absent.
    """
    # Locate the sqlite file (look up its path using database manager-friendly method): it lives
    # under <...>/RTX/code/ARAX/KnowledgeSources/KG2c, named after the configured kg2c sqlite path
    path_list = os.path.realpath(__file__).split(os.path.sep)
    rtx_index = path_list.index("RTX")
    rtxc = RTXConfiguration()
    sqlite_dir_path = os.path.sep.join([
        *path_list[:(rtx_index + 1)], 'code', 'ARAX', 'KnowledgeSources', 'KG2c'
    ])
    sqlite_name = rtxc.kg2c_sqlite_path.split('/')[-1]
    sqlite_file_path = f"{sqlite_dir_path}{os.path.sep}{sqlite_name}"

    answer_kg = QGOrganizedKnowledgeGraph()
    connection = sqlite3.connect(sqlite_file_path)
    try:
        cursor = connection.cursor()

        # Grab the node objects from sqlite corresponding to the returned node IDs
        num_nodes = sum(len(nodes) for nodes in plover_answer["nodes"].values())
        start = time.time()
        for qnode_key, node_keys in plover_answer["nodes"].items():
            # SQL wants ('node1', 'node2') format for string lists. Single quotes inside an ID are
            # doubled so an ID containing an apostrophe can't end the SQL string literal early.
            node_keys_str = "','".join(
                str(node_key).replace("'", "''") for node_key in node_keys)
            sql_query = f"SELECT N.node " \
                        f"FROM nodes AS N " \
                        f"WHERE N.id IN ('{node_keys_str}')"
            log.debug(
                f"Looking up {len(plover_answer['nodes'][qnode_key])} returned {qnode_key} node IDs in KG2c sqlite"
            )
            cursor.execute(sql_query)
            for row in cursor.fetchall():
                # Each row holds one JSON-serialized node
                node_as_dict = ujson.loads(row[0])
                node_key, node = self._convert_neo4j_node_to_trapi_node(node_as_dict, kg_name)
                answer_kg.add_node(node_key, node, qnode_key)
        log.debug(
            f"Grabbing {num_nodes} nodes from sqlite and loading into object model took "
            f"{round(time.time() - start, 2)} seconds")

        # Grab the edge objects from sqlite corresponding to the returned edge IDs
        num_edges = sum(len(edges) for edges in plover_answer["edges"].values())
        start = time.time()
        for qedge_key, edge_keys in plover_answer["edges"].items():
            # SQL wants (1, 2) format int lists
            # NOTE(review): edge IDs are assumed to be ints and are inserted unquoted - confirm
            edge_keys_str = ",".join(str(edge_key) for edge_key in edge_keys)
            sql_query = f"SELECT E.edge " \
                        f"FROM edges AS E " \
                        f"WHERE E.id IN ({edge_keys_str})"
            log.debug(
                f"Looking up {len(plover_answer['edges'][qedge_key])} returned {qedge_key} edge IDs in KG2c sqlite"
            )
            cursor.execute(sql_query)
            for row in cursor.fetchall():
                # Each row holds one JSON-serialized edge
                edge_as_dict = ujson.loads(row[0])
                edge_key, edge = self._convert_neo4j_edge_to_trapi_edge(
                    edge_as_dict, dict(), kg_name)
                answer_kg.add_edge(edge_key, edge, qedge_key)
        log.debug(
            f"Grabbing {num_edges} edges from sqlite and loading into object model took "
            f"{round(time.time() - start, 2)} seconds")

        cursor.close()
    finally:
        # Always release the database handle, even if a query or JSON parse raised above
        connection.close()

    return answer_kg