def _convert_kg2_node_to_swagger_node(self, neo4j_node):
    """
    Build a swagger Node from a KG2 neo4j node.

    :param neo4j_node: Mapping of KG2 node properties. Every property is read with .get(), so a property
                       that is absent comes through as None.
    :return: A Node whose id/name/description/uri/type are copied from the KG2 node and whose
             node_attributes hold whatever self._create_swagger_attributes() returns for the additional
             KG2 properties listed below.
    """
    swagger_node = Node()
    swagger_node.id = neo4j_node.get('id')
    swagger_node.name = neo4j_node.get('name')
    swagger_node.description = neo4j_node.get('description')
    swagger_node.uri = neo4j_node.get('iri')
    swagger_node.node_attributes = []
    node_category = neo4j_node.get('category_label')
    swagger_node.type = eu.convert_string_or_list_to_list(node_category)

    # Fill out the 'symbol' property (only really relevant for nodes from UniProtKB)
    # The isinstance() guard keeps a node that lacks an 'id' from raising AttributeError on None.lower()
    node_id = swagger_node.id
    if (swagger_node.symbol is None and isinstance(node_id, str)
            and node_id.lower().startswith("uniprot")):
        # For these nodes the KG2 'name' is used as the symbol and 'full_name' becomes the name
        swagger_node.symbol = neo4j_node.get('name')
        swagger_node.name = neo4j_node.get('full_name')

    # Add all additional properties on KG2 nodes as swagger NodeAttribute objects
    additional_kg2_node_properties = [
        'publications', 'synonym', 'category', 'provided_by', 'deprecated', 'update_date'
    ]
    node_attributes = self._create_swagger_attributes(
        "node", additional_kg2_node_properties, neo4j_node)
    swagger_node.node_attributes += node_attributes
    return swagger_node
def _prune_answers_to_achieve_curie_to_curie_query(
        kg: DictKnowledgeGraph, output_qnode: QNode,
        qedge: QEdge) -> DictKnowledgeGraph:
    """
    Work around BTE only supporting (node with curie)-->(non-specific node) queries.

    The caller runs the non-specific query; this function then drops every answer node for the 'output'
    qnode that is not one of the curies requested on that qnode, plus every answer edge for the qedge
    whose target is one of the dropped nodes.

    :param kg: Answer knowledge graph; it is modified in place.
    :param output_qnode: The query node whose curie(s) the answer should be restricted to.
    :param qedge: The query edge whose answer edges should be cleaned up.
    :return: The same kg object that was passed in.
    """
    # Work out which 'output' nodes are not among the curies that were actually asked for
    wanted_curies = set(eu.convert_string_or_list_to_list(output_qnode.curie))
    output_nodes = kg.nodes_by_qg_id[output_qnode.id]
    unwanted_node_ids = {node_id for node_id in output_nodes if node_id not in wanted_curies}
    for unwanted_node_id in unwanted_node_ids:
        output_nodes.pop(unwanted_node_id)

    # Then drop the edges that pointed at them (edge target_id always contains output node ID for BTE)
    answer_edges = kg.edges_by_qg_id[qedge.id]
    orphaned_edge_ids = {
        edge_id for edge_id, edge in answer_edges.items()
        if edge.target_id in unwanted_node_ids
    }
    for orphaned_edge_id in orphaned_edge_ids:
        answer_edges.pop(orphaned_edge_id)

    return kg
def _override_qnode_types_as_needed(self, query_graph: QueryGraph) -> QueryGraph:
    """
    Replace each qnode's categories with this KP's overrides, where an override exists.

    :param query_graph: Query graph whose qnodes are updated in place.
    :return: The same query graph, with every qnode.category set to a de-duplicated list.
    """
    for qnode in query_graph.nodes.values():
        replaced_categories = set()
        for category in eu.convert_string_or_list_to_list(qnode.category):
            # Categories without an override are kept as they are
            replaced_categories.add(
                self.node_category_overrides_for_kp.get(category, category))
        qnode.category = list(replaced_categories)
    return query_graph
def _build_kg_to_qg_id_dict(results):
    """
    Map every knowledge graph ID mentioned in the bindings to the query graph ID it is bound to.

    Node bindings are normalized the same way as edge bindings: a binding's 'kg_id' is passed through
    eu.convert_string_or_list_to_list(), so a list-valued node 'kg_id' yields one entry per ID instead of
    failing as an unhashable dict key.

    :param results: Mapping with 'node_bindings' and 'edge_bindings' entries; each binding is a mapping
                    with a 'kg_id' and a 'qg_id'.
    :return: {'nodes': {kg_id: qg_id, ...}, 'edges': {kg_id: qg_id, ...}}
    """
    kg_to_qg_ids = {'nodes': dict(), 'edges': dict()}
    binding_kinds = (('node_bindings', 'nodes'), ('edge_bindings', 'edges'))
    for bindings_key, kg_section in binding_kinds:
        for binding in results[bindings_key]:
            # NOTE(review): assumes convert_string_or_list_to_list() wraps a single string ID in a list
            kg_ids = eu.convert_string_or_list_to_list(binding['kg_id'])
            qg_id = binding['qg_id']
            for kg_id in kg_ids:
                kg_to_qg_ids[kg_section][kg_id] = qg_id
    return kg_to_qg_ids
def _convert_kg1_node_to_swagger_node(neo4j_node: Dict[str, any]) -> Node:
    """
    Build a swagger Node from a KG1 neo4j node.

    :param neo4j_node: Mapping of KG1 node properties; absent properties are copied over as None.
    :return: A Node with id/name/symbol/description/uri copied across, an empty node_attributes list,
             and its type taken from the KG1 'category' property.
    """
    swagger_node = Node()
    # These swagger attributes carry the same name as the KG1 property they are copied from
    for property_name in ('id', 'name', 'symbol', 'description', 'uri'):
        setattr(swagger_node, property_name, neo4j_node.get(property_name))
    swagger_node.node_attributes = []
    swagger_node.type = eu.convert_string_or_list_to_list(neo4j_node.get('category'))
    return swagger_node
def _pre_process_query_graph(self, query_graph: QueryGraph, log: ARAXResponse) -> QueryGraph:
    """
    Adapt a query graph to what this KP accepts (categories and curie prefixes).

    For each qnode, its categories are run through self.node_category_overrides_for_kp and intersected
    with self.accepted_node_categories. If nothing is left, an "UnsupportedQueryForKP" error is logged and
    the query graph is returned right away (later qnodes are not processed). Otherwise qnode.category is
    set to a single accepted category. If the qnode has an id, it is replaced by the equivalent curie(s)
    that start with the KP's preferred prefix for that category; when none exist, the id is left alone
    and a warning is logged.

    :param query_graph: Query graph whose qnodes are updated in place.
    :param log: Response object used for error/warning/debug messages.
    :return: The same query graph object.
    """
    for qnode_key, qnode in query_graph.nodes.items():
        # Convert node types to preferred format and verify we can do this query
        formatted_qnode_categories = {
            self.node_category_overrides_for_kp.get(qnode_category, qnode_category)
            for qnode_category in eu.convert_string_or_list_to_list(qnode.category)
        }
        accepted_qnode_categories = formatted_qnode_categories.intersection(
            self.accepted_node_categories)
        if not accepted_qnode_categories:
            log.error(
                f"{self.kp_name} can only be used for queries involving {self.accepted_node_categories} "
                f"and QNode {qnode_key} has category '{qnode.category}'",
                error_code="UnsupportedQueryForKP")
            return query_graph
        # Set iteration order is arbitrary, so take the smallest category rather than "whichever comes
        # first"; that way the same query always resolves to the same category (and preferred prefix)
        qnode.category = min(accepted_qnode_categories)

        # Convert curies to equivalent curies accepted by the KP (depending on qnode type)
        if qnode.id:
            equivalent_curies = eu.get_curie_synonyms(qnode.id, log)
            preferred_prefix = self.kp_preferred_prefixes[qnode.category]
            curie_start = f"{preferred_prefix}:"
            desired_curies = [
                curie for curie in equivalent_curies if curie.startswith(curie_start)
            ]
            if desired_curies:
                # A lone curie is stored as a plain string rather than a one-item list
                qnode.id = desired_curies if len(desired_curies) > 1 else desired_curies[0]
                log.debug(f"Converted qnode {qnode_key} curie to {qnode.id}")
            else:
                log.warning(
                    f"Could not convert qnode {qnode_key} curie(s) to preferred prefix ({preferred_prefix})"
                )
    return query_graph
def apply(self, input_message, input_parameters, response=None):
    """
    Expand the message's knowledge graph for the requested query edges and/or query nodes.

    :param input_message: Message to expand; its query_graph is read and its knowledge_graph is replaced
                          with the expanded graph at the end.
    :param input_parameters: Dict of parameter overrides. The string values "true"/"false" (any casing)
                             are converted to booleans before being stored.
    :param response: Response object to log to; a new Response is created when None.
    :return: The response object. It is returned early, without finishing the expansion, whenever
             response.status is found to be something other than 'OK'.
    """
    if response is None:
        response = Response()
    # Kept on the instance; the expansion code below also logs through self.response
    self.response = response
    self.message = input_message

    # Basic checks on arguments
    if not isinstance(input_parameters, dict):
        response.error("Provided parameters is not a dict",
                       error_code="ParametersNotDict")
        return response

    # Define a complete set of allowed parameters and their defaults
    # Note: this is the instance's own dict, not a copy, so the assignments below modify self.parameters
    # in place. 'edge_id' and 'node_id' get no default here; they are read further down, so they are
    # expected to already be present in self.parameters.
    parameters = self.parameters
    parameters['kp'] = "ARAX/KG1"
    parameters['enforce_directionality'] = False
    parameters['use_synonyms'] = True
    parameters['synonym_handling'] = 'map_back'
    parameters['continue_if_no_results'] = False
    for key, value in input_parameters.items():
        if key and key not in parameters:
            # The loop keeps going after an unknown parameter; the status check below is what ends the call
            response.error(f"Supplied parameter {key} is not permitted",
                           error_code="UnknownParameter")
        else:
            if type(value) is str and value.lower() == "true":
                value = True
            elif type(value) is str and value.lower() == "false":
                value = False
            parameters[key] = value

    # Default to expanding the entire query graph if the user didn't specify what to expand
    if not parameters['edge_id'] and not parameters['node_id']:
        parameters['edge_id'] = [
            edge.id for edge in self.message.query_graph.edges
        ]
        parameters['node_id'] = self._get_orphan_query_node_ids(
            self.message.query_graph)

    if response.status != 'OK':
        return response

    response.data['parameters'] = parameters
    self.parameters = parameters

    # Do the actual expansion
    response.debug(
        f"Applying Expand to Message with parameters {parameters}")
    input_edge_ids = eu.convert_string_or_list_to_list(
        parameters['edge_id'])
    input_node_ids = eu.convert_string_or_list_to_list(
        parameters['node_id'])
    kp_to_use = self.parameters['kp']
    continue_if_no_results = self.parameters['continue_if_no_results']

    # Convert message knowledge graph to dictionary format, for faster processing
    dict_kg = eu.convert_standard_kg_to_dict_kg(
        self.message.knowledge_graph)

    # Expand any specified edges
    if input_edge_ids:
        query_sub_graph = self._extract_query_subgraph(
            input_edge_ids, self.message.query_graph)
        if response.status != 'OK':
            return response
        self.response.debug(
            f"Query graph for this Expand() call is: {query_sub_graph.to_dict()}"
        )

        # Expand the query graph edge by edge (much faster for neo4j queries, and allows easy integration with BTE)
        ordered_qedges_to_expand = self._get_order_to_expand_edges_in(
            query_sub_graph)
        node_usages_by_edges_map = dict()
        for qedge in ordered_qedges_to_expand:
            # Each step is followed by a status check, so the first failing step ends the whole call
            answer_kg, edge_node_usage_map = self._expand_edge(
                qedge, kp_to_use, dict_kg, continue_if_no_results,
                self.message.query_graph)
            if response.status != 'OK':
                return response
            node_usages_by_edges_map[qedge.id] = edge_node_usage_map

            self._process_and_merge_answer(answer_kg, dict_kg)
            if response.status != 'OK':
                return response

            self._prune_dead_end_paths(dict_kg, query_sub_graph,
                                       node_usages_by_edges_map)
            if response.status != 'OK':
                return response

    # Expand any specified nodes
    if input_node_ids:
        for qnode_id in input_node_ids:
            answer_kg = self._expand_node(qnode_id, kp_to_use,
                                          continue_if_no_results,
                                          self.message.query_graph)
            if response.status != 'OK':
                return response

            self._process_and_merge_answer(answer_kg, dict_kg)
            if response.status != 'OK':
                return response

    # Convert message knowledge graph back to API standard format
    self.message.knowledge_graph = eu.convert_dict_kg_to_standard_kg(
        dict_kg)

    # Return the response and done
    kg = self.message.knowledge_graph
    response.info(
        f"After Expand, Message.KnowledgeGraph has {len(kg.nodes)} nodes and {len(kg.edges)} edges"
    )
    return response
def _deduplicate_nodes(
        dict_kg: DictKnowledgeGraph,
        edge_to_nodes_map: Dict[str, Dict[str, str]],
        log: Response
) -> Tuple[DictKnowledgeGraph, Dict[str, Dict[str, str]]]:
    """
    Collapse equivalent nodes in an answer KG onto their preferred curies and re-point the edges.

    Every node is renamed to the preferred curie/name/type reported by eu.get_preferred_curies() (nodes
    it does not recognize keep their own values), and only one node per preferred curie is kept under
    each qnode ID. A node that shows up under more than one qnode ID is added under each of them; its
    preferred curie is only looked up the first time it is seen. Edges then have their source_id and
    target_id rewritten to the preferred curies, and the edge-to-node map is rewritten the same way.

    :param dict_kg: Answer KG to deduplicate; its node and edge objects are modified and reused.
    :param edge_to_nodes_map: For each edge ID, a mapping of qnode ID to the node ID fulfilling it.
    :param log: Response object used for logging and for checking status.
    :return: (deduplicated KG, updated edge-to-node map). If log.status is not 'OK' after a curie lookup,
             or an edge endpoint has no curie mapping (an error is logged), whatever has been built so
             far is returned.
    """
    log.debug("Deduplicating nodes")
    deduplicated_kg = DictKnowledgeGraph(
        nodes={qnode_id: dict() for qnode_id in dict_kg.nodes_by_qg_id},
        edges={qedge_id: dict() for qedge_id in dict_kg.edges_by_qg_id})
    updated_edge_to_nodes_map = {edge_id: dict() for edge_id in edge_to_nodes_map}
    curie_mappings = dict()
    # Preferred (name, type) per original node ID, remembered so that a node which turns up again under
    # a later qnode ID can be added there without another lookup
    preferred_info_by_node_id = dict()

    # First deduplicate the nodes
    for qnode_id, nodes in dict_kg.nodes_by_qg_id.items():
        # Load preferred curie info from NodeSynonymizer for nodes we haven't seen before
        unmapped_node_ids = set(nodes).difference(curie_mappings)
        log.debug(f"Getting preferred curies for {qnode_id} nodes returned in this step")
        canonicalized_nodes = eu.get_preferred_curies(
            list(unmapped_node_ids), log) if unmapped_node_ids else dict()
        if log.status != 'OK':
            return deduplicated_kg, updated_edge_to_nodes_map

        for node_id in unmapped_node_ids:
            # Figure out the preferred curie/name for this node
            node = nodes.get(node_id)
            canonicalized_node = canonicalized_nodes.get(node_id)
            if canonicalized_node:
                preferred_curie = canonicalized_node.get('preferred_curie', node_id)
                preferred_name = canonicalized_node.get('preferred_name', node.name)
                preferred_type = eu.convert_string_or_list_to_list(
                    canonicalized_node.get('preferred_type', node.type))
            else:
                # Means the NodeSynonymizer didn't recognize this curie
                preferred_curie = node_id
                preferred_name = node.name
                preferred_type = node.type
            curie_mappings[node_id] = preferred_curie
            preferred_info_by_node_id[node_id] = (preferred_name, preferred_type)

        # Add the nodes into our deduplicated KG as necessary. This walks ALL of this qnode's nodes, not
        # just the newly-mapped ones; otherwise a node already seen under an earlier qnode ID would be
        # left out of this qnode ID.
        # TODO: merge certain fields, like uri?
        deduplicated_nodes = deduplicated_kg.nodes_by_qg_id[qnode_id]
        for node_id, node in nodes.items():
            preferred_curie = curie_mappings[node_id]
            if preferred_curie not in deduplicated_nodes:
                preferred_name, preferred_type = preferred_info_by_node_id[node_id]
                node.id = preferred_curie
                node.name = preferred_name
                node.type = preferred_type
                deduplicated_kg.add_node(node, qnode_id)

    # Then update the edges to reflect changes made to the nodes
    for qedge_id, edges in dict_kg.edges_by_qg_id.items():
        for edge_id, edge in edges.items():
            edge.source_id = curie_mappings.get(edge.source_id)
            edge.target_id = curie_mappings.get(edge.target_id)
            if not edge.source_id or not edge.target_id:
                log.error(
                    f"Could not find preferred curie mappings for edge {edge_id}'s node(s)"
                )
                return deduplicated_kg, updated_edge_to_nodes_map
            deduplicated_kg.add_edge(edge, qedge_id)

            # Update the edge-to-node map for this edge (used down the line for pruning)
            for qnode_id, corresponding_node_id in edge_to_nodes_map[edge_id].items():
                updated_edge_to_nodes_map[edge_id][qnode_id] = curie_mappings.get(
                    corresponding_node_id)

    log.debug(
        f"After deduplication, answer KG counts are: {eu.get_printable_counts_by_qg_id(deduplicated_kg)}"
    )
    return deduplicated_kg, updated_edge_to_nodes_map
def _send_query_to_kp(self, query_graph: QueryGraph, log: ARAXResponse) -> Dict[str, any]:
    """
    Query the KP once per input curie and merge the answers into a single message.

    :param query_graph: Single-edge query graph; only the first qedge is sent.
    :param log: Response object used for debug/warning messages.
    :return: The first KP message that has results, with the knowledge graph nodes/edges and the results
             of every later message with results merged into it. An empty dict if no query returned
             results.
    """
    # Strip the qnodes down to only the properties the KP's API likes
    qnodes_for_kp = dict()
    for qnode_key, qnode in query_graph.nodes.items():
        qnodes_for_kp[qnode_key] = {'category': qnode.category}
        if qnode.id:
            qnodes_for_kp[qnode_key]['id'] = qnode.id

    # Same for the qedge (our query graph is single-edge, so the first qedge is the one to send)
    qedge_key = next(iter(query_graph.edges))
    qedge = query_graph.edges[qedge_key]
    qedge_for_kp = {'subject': qedge.subject, 'object': qedge.object}
    if qedge.predicate:
        qedge_for_kp['predicate'] = qedge.predicate
    else:
        # No predicate given, so fall back on one the KP accepts
        qedge_for_kp['predicate'] = list(self.accepted_edge_types)[0]
    if qedge_for_kp['predicate'] not in self.accepted_edge_types:
        log.warning(
            f"{self.kp_name} only accepts the following edge types: {self.accepted_edge_types}"
        )

    subject_qnode_for_kp = qnodes_for_kp[qedge.subject]
    input_curies = eu.convert_string_or_list_to_list(subject_qnode_for_kp['id'])
    combined_message = dict()
    # Until we have batch querying, ping them one-by-one for each input curie
    for input_curie in input_curies:
        log.debug(
            f"Sending {qedge_key} query to {self.kp_name} for {input_curie}"
        )
        # The subject qnode is re-pointed at the current curie before each request
        subject_qnode_for_kp['id'] = input_curie
        request_body = {
            'message': {
                'query_graph': {
                    'nodes': qnodes_for_kp,
                    'edges': {qedge_key: qedge_for_kp}
                }
            }
        }
        kp_response = requests.post(self.kp_query_endpoint,
                                    json=request_body,
                                    headers={'accept': 'application/json'})
        if kp_response.status_code != 200:
            log.warning(
                f"{self.kp_name} KP API returned response of {kp_response.status_code}: {kp_response.text}"
            )
            continue

        kp_message = kp_response.json()["message"]
        if not kp_message.get('results'):
            continue
        if not combined_message:
            # The first message with results becomes the base that later ones are merged into
            combined_message = kp_message
            continue
        combined_kg = combined_message['knowledge_graph']
        combined_kg['nodes'].update(kp_message['knowledge_graph']['nodes'])
        combined_kg['edges'].update(kp_message['knowledge_graph']['edges'])
        combined_message['results'] += kp_message['results']
    return combined_message
def _validate_and_pre_process_input(self, query_graph, valid_bte_inputs_dict, enforce_directionality):
    """
    Check that a query graph is something BTE can answer and get its pieces into the shape BTE wants.

    :param query_graph: Query graph to validate; its qnodes are modified in place (type and curie).
    :param valid_bte_inputs_dict: Dict whose 'predicates' and 'node_types' entries hold the values BTE
                                  accepts.
    :param enforce_directionality: When true, the qedge's source/target decide which qnode is the input;
                                   otherwise the first qnode that has a curie is the input.
    :return: (qedge, input_qnode, output_qnode), or (None, None, None) if validation failed (an error is
             logged on self.response in that case).
    """
    validation_failed = (None, None, None)

    # Make sure we have a valid one-hop query graph
    is_one_hop = len(query_graph.edges) == 1 and len(query_graph.nodes) == 2
    if not is_one_hop:
        self.response.error(f"BTE can only accept one-hop query graphs (your QG has {len(query_graph.nodes)} "
                            f"nodes and {len(query_graph.edges)} edges)", error_code="InvalidQueryGraph")
        return validation_failed
    qedge = query_graph.edges[0]

    # Make sure at least one of our qnodes has a curie
    qnodes_with_curies = [qnode for qnode in query_graph.nodes if qnode.curie]
    if not qnodes_with_curies:
        self.response.error(f"Neither qnode for qedge {qedge.id} has a curie specified. BTE requires that at least"
                            f" one of them has a curie. Your query graph is: {query_graph.to_dict()}")
        return validation_failed

    # Figure out which query node is input vs. output and validate which qnodes have curies
    if enforce_directionality:
        input_qnode = next(qnode for qnode in query_graph.nodes if qnode.id == qedge.source_id)
        output_qnode = next(qnode for qnode in query_graph.nodes if qnode.id == qedge.target_id)
    else:
        # The list is known to be non-empty at this point, so its first entry is the input
        input_qnode = qnodes_with_curies[0]
        output_qnode = next(qnode for qnode in query_graph.nodes if qnode.id != input_qnode.id)
    if not input_qnode.curie:
        self.response.error(f"BTE cannot expand edges with a non-specific (curie-less) source node (source node is:"
                            f" {input_qnode.to_dict()})", error_code="InvalidInput")
    elif not enforce_directionality:
        self.response.warning(f"BTE cannot do bidirectional queries; the query for this edge will be directed, "
                              f"going: {input_qnode.id}-->{output_qnode.id}")
    if self.response.status != 'OK':
        return validation_failed

    # Make sure predicate is allowed (having no predicate at all is fine)
    if qedge.type not in valid_bte_inputs_dict['predicates'] and qedge.type is not None:
        self.response.error(f"BTE does not accept predicate '{qedge.type}'. Valid options are "
                            f"{valid_bte_inputs_dict['predicates']}", error_code="InvalidInput")
        return validation_failed

    # Process qnode types (guess one if none provided, convert to preferred format, make sure allowed)
    both_qnodes = [input_qnode, output_qnode]
    for qnode in both_qnodes:
        if not qnode.type:
            qnode.type = eu.guess_qnode_type(qnode.curie, self.response)
    for qnode in both_qnodes:
        qnode.type = eu.convert_string_to_pascal_case(qnode.type)
    qnodes_missing_type = [qnode.id for qnode in both_qnodes if not qnode.type]
    if qnodes_missing_type:
        self.response.error(f"BTE requires every query node to have a type. QNode(s) missing a type: "
                            f"{', '.join(qnodes_missing_type)}", error_code="InvalidInput")
        return validation_failed
    invalid_qnode_types = [qnode.type for qnode in both_qnodes
                           if qnode.type not in valid_bte_inputs_dict['node_types']]
    if invalid_qnode_types:
        self.response.error(f"BTE does not accept QNode type(s): {', '.join(invalid_qnode_types)}. Valid options are"
                            f" {valid_bte_inputs_dict['node_types']}", error_code="InvalidInput")
        return validation_failed

    # Make sure our input node curies are in list form and use prefixes BTE prefers
    input_qnode.curie = [
        eu.convert_curie_to_bte_format(curie)
        for curie in eu.convert_string_or_list_to_list(input_qnode.curie)
    ]

    return qedge, input_qnode, output_qnode
def _add_answers_to_kg(self, answer_kg: DictKnowledgeGraph,
                       reasoner_std_response: Dict[str, any],
                       input_qnode_id: str, output_qnode_id: str,
                       qedge_id: str, log: Response) -> DictKnowledgeGraph:
    """
    Copy the nodes and edges of a BTE response into the answer KG, under our own query graph IDs.

    :param answer_kg: Knowledge graph to add to; it is modified in place.
    :param reasoner_std_response: BTE response; its 'results' and 'knowledge_graph' entries are read.
    :param input_qnode_id: Our qnode ID for what BTE calls "n0".
    :param output_qnode_id: Our qnode ID for what BTE calls "n1".
    :param qedge_id: Our qedge ID for what BTE calls "e1".
    :param log: Response object used for logging.
    :return: The same answer_kg object. If a BTE query graph ID cannot be mapped, an "UnknownQGID" error
             is logged and answer_kg is returned as filled in up to that point.
    """
    kg_to_qg_ids_dict = self._build_kg_to_qg_id_dict(reasoner_std_response['results'])
    bte_kg = reasoner_std_response['knowledge_graph']
    if not bte_kg['edges']:
        # Nothing is added (not even nodes) when BTE returned no edges
        return answer_kg

    remapped_node_ids = dict()
    log.debug(
        f"Got results back from BTE for this query "
        f"({len(reasoner_std_response['knowledge_graph']['edges'])} edges)"
    )

    for bte_node in bte_kg['nodes']:
        swagger_node = Node()
        bte_node_id = bte_node.get('id')
        swagger_node.name = bte_node.get('name')
        snake_case_type = eu.convert_string_to_snake_case(bte_node.get('type'))
        swagger_node.type = eu.convert_string_or_list_to_list(snake_case_type)

        # Map the returned BTE qg_ids back to the original qnode_ids in our query graph
        bte_qg_id = kg_to_qg_ids_dict['nodes'].get(bte_node_id)
        if bte_qg_id == "n0":
            qnode_id = input_qnode_id
        elif bte_qg_id == "n1":
            qnode_id = output_qnode_id
        else:
            log.error("Could not map BTE qg_id to ARAX qnode_id",
                      error_code="UnknownQGID")
            return answer_kg

        # Find and use the preferred equivalent identifier for this node (if it's an output node)
        if qnode_id != output_qnode_id:
            swagger_node.id = bte_node_id
        elif bte_node_id in remapped_node_ids:
            swagger_node.id = remapped_node_ids.get(bte_node_id)
        else:
            equivalent_curies = []
            for prefix, local_ids in bte_node.get('equivalent_identifiers').items():
                for local_id in local_ids:
                    equivalent_curies.append(
                        f"{prefix}:{eu.get_curie_local_id(local_id)}")
            swagger_node.id = self._get_best_equivalent_bte_curie(
                equivalent_curies, swagger_node.type[0])
            # Remembered so the edges below can be pointed at the new ID
            remapped_node_ids[bte_node_id] = swagger_node.id

        answer_kg.add_node(swagger_node, qnode_id)

    for bte_edge in bte_kg['edges']:
        swagger_edge = Edge()
        swagger_edge.id = bte_edge.get("id")
        swagger_edge.type = bte_edge.get('type')
        # Use the remapped ID for an endpoint where there is one, otherwise BTE's own ID
        swagger_edge.source_id = remapped_node_ids.get(
            bte_edge.get('source_id'), bte_edge.get('source_id'))
        swagger_edge.target_id = remapped_node_ids.get(
            bte_edge.get('target_id'), bte_edge.get('target_id'))
        swagger_edge.is_defined_by = "BTE"
        swagger_edge.provided_by = bte_edge.get('edge_source')

        # Map the returned BTE qg_id back to the original qedge_id in our query graph
        bte_qg_id = kg_to_qg_ids_dict['edges'].get(swagger_edge.id)
        if bte_qg_id != "e1":
            log.error("Could not map BTE qg_id to ARAX qedge_id",
                      error_code="UnknownQGID")
            return answer_kg

        answer_kg.add_edge(swagger_edge, qedge_id)

    return answer_kg