def get_canonical_curies_dict(curie: Union[str, List[str]], log: ARAXResponse) -> Dict[str, Dict[str, str]]:
    """Map each input curie to its canonical info using the NodeSynonymizer.

    Returns the synonymizer's dict of input curie -> canonical info, or {} when
    the synonymizer raises or hands back None. Curies the synonymizer could not
    canonicalize are reported via a warning on the log.
    """
    input_curies = convert_string_or_list_to_list(curie)
    try:
        synonymizer = NodeSynonymizer()
        log.debug(f"Sending NodeSynonymizer.get_canonical_curies() a list of {len(input_curies)} curies")
        canonical_curies_dict = synonymizer.get_canonical_curies(input_curies)
        log.debug(f"Got response back from NodeSynonymizer")
    except Exception:
        tb = traceback.format_exc()
        error_type, error, _ = sys.exc_info()
        log.error(f"Encountered a problem using NodeSynonymizer: {tb}", error_code=error_type.__name__)
        return {}
    if canonical_curies_dict is None:
        log.error(f"NodeSynonymizer returned None", error_code="NodeNormalizationIssue")
        return {}
    # Flag any input curies the synonymizer had no canonical info for
    unrecognized_curies = {key for key, info in canonical_curies_dict.items() if not info}
    if unrecognized_curies:
        log.warning(f"NodeSynonymizer did not return canonical info for: {unrecognized_curies}")
    return canonical_curies_dict
def _load_answers_into_kg(self, neo4j_results: List[Dict[str, List[Dict[str, any]]]], kg_name: str, qg: QueryGraph, log: ARAXResponse) -> QGOrganizedKnowledgeGraph:
    """Convert raw neo4j query results into a QG-organized knowledge graph.

    Each column in the results table is named either 'nodes_<qnode_key>' or
    'edges_<qedge_key>'; its contents are converted to TRAPI objects and filed
    under the corresponding query graph key.
    """
    log.debug(f"Processing query results for edge {next(qedge_key for qedge_key in qg.edges)}")
    final_kg = QGOrganizedKnowledgeGraph()
    # KG1 edges reference their nodes by UUID, so build a UUID->curie lookup for it
    node_uuid_to_curie_dict = self._build_node_uuid_to_curie_dict(neo4j_results[0]) if kg_name == "KG1" else dict()
    results_table = neo4j_results[0]
    for column_name in list(results_table):
        # Load answer nodes into our knowledge graph
        if column_name.startswith('nodes'):  # Example column name: 'nodes_n00'
            qnode_key_for_column = column_name.replace("nodes_", "", 1)
            for neo4j_node in results_table.get(column_name):
                node_key, node = self._convert_neo4j_node_to_trapi_node(neo4j_node, kg_name)
                final_kg.add_node(node_key, node, qnode_key_for_column)
        # Load answer edges into our knowledge graph
        elif column_name.startswith('edges'):  # Example column name: 'edges_e01'
            qedge_key_for_column = column_name.replace("edges_", "", 1)
            for neo4j_edge in results_table.get(column_name):
                edge_key, edge = self._convert_neo4j_edge_to_trapi_edge(neo4j_edge, node_uuid_to_curie_dict, kg_name)
                final_kg.add_edge(edge_key, edge, qedge_key_for_column)
    return final_kg
def get_canonical_curies_list(curie: Union[str, List[str]], log: ARAXResponse) -> List[str]:
    """Return the list of canonical curies for the given input curie(s).

    Input curies the NodeSynonymizer can't canonicalize are passed through
    unchanged; an empty list is returned on synonymizer failure.
    """
    input_curies = convert_to_list(curie)
    try:
        synonymizer = NodeSynonymizer()
        log.debug(f"Sending NodeSynonymizer.get_canonical_curies() a list of {len(input_curies)} curies")
        canonical_curies_dict = synonymizer.get_canonical_curies(input_curies)
        log.debug(f"Got response back from NodeSynonymizer")
    except Exception:
        tb = traceback.format_exc()
        error_type, error, _ = sys.exc_info()
        log.error(f"Encountered a problem using NodeSynonymizer: {tb}", error_code=error_type.__name__)
        return []
    if canonical_curies_dict is None:
        log.error(f"NodeSynonymizer returned None", error_code="NodeNormalizationIssue")
        return []
    recognized_input_curies = {key for key, info in canonical_curies_dict.items() if info}
    unrecognized_curies = set(input_curies).difference(recognized_input_curies)
    if unrecognized_curies:
        log.warning(f"NodeSynonymizer did not return canonical info for: {unrecognized_curies}")
    canonical_curies = {canonical_curies_dict[key].get('preferred_curie') for key in recognized_input_curies}
    # Include any original curies we weren't able to find a canonical version for
    canonical_curies.update(unrecognized_curies)
    if not canonical_curies:
        log.error(f"Final list of canonical curies is empty. This shouldn't happen!", error_code="CanonicalCurieIssue")
    return list(canonical_curies)
def _add_inverted_predicates(qg: QueryGraph, log: ARAXResponse) -> QueryGraph:
    """Expand the query edge's predicate list with Biolink inverse predicates.

    Downloads the Biolink Model yaml and, for each predicate on the (single)
    query edge, adds its declared inverse (if any) to the edge's predicate list.
    On any download failure the QG is returned unmodified (with a warning).
    """
    # For now, we'll consider BOTH predicates in an inverse pair (TODO: later tailor to what we know is in KG2)
    qedge = next(qedge for qedge in qg.edges.values())
    try:
        # Bound the download and catch network errors: a hung or failed fetch
        # should only disable inverse expansion, not crash the whole query
        response = requests.get("https://raw.githubusercontent.com/biolink/biolink-model/master/biolink-model.yaml",
                                timeout=10)
    except requests.exceptions.RequestException as e:
        log.warning(f"Cannot check for inverse predicates: Failed to load Biolink Model yaml file. ({e})")
        return qg
    if response.status_code == 200:
        qedge.predicate = eu.convert_to_list(qedge.predicate)
        biolink_model = yaml.safe_load(response.text)
        inverse_predicates = set()
        for predicate in qedge.predicate:
            english_predicate = predicate.split(":")[-1].replace("_", " ")  # Converts to 'subclass of' format
            biolink_predicate_info = biolink_model["slots"].get(english_predicate)
            if biolink_predicate_info and "inverse" in biolink_predicate_info:
                english_inverse_predicate = biolink_predicate_info["inverse"]
                machine_inverse_predicate = f"biolink:{english_inverse_predicate.replace(' ', '_')}"
                inverse_predicates.add(machine_inverse_predicate)
                log.debug(f"Found inverse predicate for {predicate}: {machine_inverse_predicate}")
        # Union so already-present predicates aren't duplicated
        qedge.predicate = list(set(qedge.predicate).union(inverse_predicates))
    else:
        log.warning(f"Cannot check for inverse predicates: Failed to load Biolink Model yaml file. "
                    f"(Page gave status {response.status_code}.)")
    return qg
def _answer_query_using_bte(self, input_qnode_key: str, output_qnode_key: str, qg: QueryGraph, answer_kg: QGOrganizedKnowledgeGraph, valid_bte_inputs_dict: Dict[str, Set[str]], log: ARAXResponse) -> Tuple[QGOrganizedKnowledgeGraph, Set[str]]:
    """Answer a single-edge query graph using BioThings Explorer.

    Queries BTE once per (input curie, input category, output category) combo,
    accumulating answers into answer_kg. Returns the updated KG plus the set of
    input curies BTE actually accepted.
    """
    accepted_curies = set()
    qedge_key = next(qedge_key for qedge_key in qg.edges)
    qedge = qg.edges[qedge_key]
    input_qnode = qg.nodes[input_qnode_key]
    output_qnode = qg.nodes[output_qnode_key]
    # Send this single-edge query to BTE, input curie by input curie (adding findings to our answer KG as we go)
    for curie in input_qnode.id:
        # Consider all different combinations of qnode types (can be multiple if gene/protein)
        for input_qnode_category, output_qnode_category in itertools.product(input_qnode.category, output_qnode.category):
            if eu.get_curie_prefix(curie) in valid_bte_inputs_dict['curie_prefixes']:
                accepted_curies.add(curie)
                loop = None
                try:
                    loop = asyncio.new_event_loop()
                    seqd = SingleEdgeQueryDispatcher(input_cls=input_qnode_category,
                                                     output_cls=output_qnode_category,
                                                     pred=qedge.predicate,
                                                     input_id=eu.get_curie_prefix(curie),
                                                     values=eu.get_curie_local_id(curie),
                                                     loop=loop)
                    log.debug(f"Sending query to BTE: {curie}-{qedge.predicate if qedge.predicate else ''}->{output_qnode_category}")
                    seqd.query()
                    reasoner_std_response = seqd.to_reasoner_std()
                except Exception:
                    trace_back = traceback.format_exc()
                    error_type, error, _ = sys.exc_info()
                    log.error(f"Encountered a problem while using BioThings Explorer. {trace_back}",
                              error_code=error_type.__name__)
                    return answer_kg, accepted_curies
                else:
                    answer_kg = self._add_answers_to_kg(answer_kg, reasoner_std_response, input_qnode_key,
                                                        output_qnode_key, qedge_key, log)
                finally:
                    # Close the per-request event loop so we don't leak loops/file
                    # descriptors across the (potentially many) iterations
                    if loop is not None:
                        loop.close()
    return answer_kg, accepted_curies
def get_preferred_categories(curie: Union[str, List[str]], log: ARAXResponse) -> Optional[List[str]]:
    """Return the preferred Biolink categories for the given input curie(s).

    Falls back to ["biolink:NamedThing"] when no preferred categories can be
    found, and returns [] when the NodeSynonymizer fails or returns None.
    """
    curies = convert_to_list(curie)
    # Wrap the synonymizer call in try/except, consistent with the other
    # synonymizer helpers in this module, so a lookup failure is logged with an
    # error_code rather than crashing the caller
    try:
        synonymizer = NodeSynonymizer()
        log.debug(f"Sending NodeSynonymizer.get_canonical_curies() a list of {len(curies)} curies")
        canonical_curies_dict = synonymizer.get_canonical_curies(curies)
        log.debug(f"Got response back from NodeSynonymizer")
    except Exception:
        tb = traceback.format_exc()
        error_type, error, _ = sys.exc_info()
        log.error(f"Encountered a problem using NodeSynonymizer: {tb}", error_code=error_type.__name__)
        return []
    if canonical_curies_dict is not None:
        recognized_input_curies = {input_curie for input_curie in canonical_curies_dict
                                   if canonical_curies_dict.get(input_curie)}
        unrecognized_curies = set(curies).difference(recognized_input_curies)
        if unrecognized_curies:
            log.warning(f"NodeSynonymizer did not recognize: {unrecognized_curies}")
        preferred_categories = {canonical_curies_dict[recognized_curie].get('preferred_category')
                                for recognized_curie in recognized_input_curies}
        if preferred_categories:
            return list(preferred_categories)
        else:
            log.warning(f"Unable to find any preferred categories; will default to biolink:NamedThing")
            return ["biolink:NamedThing"]
    else:
        log.error(f"NodeSynonymizer returned None", error_code="NodeNormalizationIssue")
        return []
def apply(self, input_message, input_parameters):
    """Apply result filters to the message according to the supplied parameters.

    Validates input_parameters against the allowed set, stores the merged
    parameters on the response and on self, then applies filters (currently
    only 'maximum_results' is implemented). Returns an ARAXResponse whose
    status reflects any parameter errors.
    """
    # Set up a fresh response object and stash handles for later use
    response = ARAXResponse()
    self.response = response
    self.message = input_message

    #### Basic checks on arguments
    if not isinstance(input_parameters, dict):
        response.error("Provided parameters is not a dict", error_code="ParametersNotDict")
        return response

    #### Define a complete set of allowed parameters and their defaults
    parameters = {
        'maximum_results': None,
        'minimum_confidence': None,
        'start_node': 1
    }

    # Override the defaults with supplied values, flagging any unknown keys
    for key, value in input_parameters.items():
        if key not in parameters:
            response.error(f"Supplied parameter {key} is not permitted", error_code="UnknownParameter")
        else:
            parameters[key] = value

    # Bail out if any of the parameters generated an error (showing not just the first one)
    if response.status != 'OK':
        return response

    #### Store these final parameters for convenience
    response.data['parameters'] = parameters
    self.parameters = parameters

    #### Now apply the filters. Order of operations is probably quite important
    #### Scalar value filters probably come first like minimum_confidence, then complex logic filters
    #### based on edge or node properties, and then finally maximum_results
    response.debug(f"Applying filter to Message with parameters {parameters}")

    #### First, as a test, blow away the results and see if we can recompute them
    #message.n_results = 0
    #message.results = []
    #self.__recompute_results()

    #### Apply scalar value filters first to do easy things and reduce the problem
    # TODO

    #### Complex logic filters probably come next. These may be hard
    # TODO

    # Finally, cap the number of results if maximum_results was provided
    if parameters['maximum_results'] is not None:
        self.__apply_maximum_results_filter(parameters['maximum_results'])

    return response
def _convert_one_hop_query_graph_to_cypher_query(self, qg: QueryGraph, enforce_directionality: bool,
                                                 log: ARAXResponse) -> str:
    """Build a complete cypher query string for a single-edge (one-hop) query graph.

    The query collects matched nodes/edges into columns named 'nodes_<qnode_key>'
    and 'edges_<qedge_key>'. Returns "" (after logging an error) on any failure.
    """
    qedge_key = next(qedge_key for qedge_key in qg.edges)
    qedge = qg.edges[qedge_key]
    log.debug(f"Generating cypher for edge {qedge_key} query graph")
    try:
        # Build the match clause
        subject_qnode_key = qedge.subject
        object_qnode_key = qedge.object
        qedge_cypher = self._get_cypher_for_query_edge(qedge_key, qg, enforce_directionality)
        source_qnode_cypher = self._get_cypher_for_query_node(subject_qnode_key, qg)
        target_qnode_cypher = self._get_cypher_for_query_node(object_qnode_key, qg)
        match_clause = f"MATCH {source_qnode_cypher}{qedge_cypher}{target_qnode_cypher}"
        # Build the where clause
        where_fragments = []
        for qnode_key in [subject_qnode_key, object_qnode_key]:
            qnode = qg.nodes[qnode_key]
            # Only multi-curie lists get a WHERE fragment here; single curies are
            # presumably baked into the node cypher by _get_cypher_for_query_node — TODO confirm
            if qnode.id and isinstance(qnode.id, list) and len(qnode.id) > 1:
                # NOTE(review): interpolates the Python list repr directly into cypher;
                # relies on curies containing no single quotes
                where_fragments.append(f"{qnode_key}.id in {qnode.id}")
            if qnode.category:
                qnode.category = eu.convert_to_list(qnode.category)
                if len(qnode.category) > 1:
                    # Create where fragment that looks like 'n00:biolink:Disease OR n00:biolink:PhenotypicFeature..'
                    category_sub_fragments = [f"{qnode_key}:`{category}`" for category in qnode.category]
                    category_where_fragment = f"({' OR '.join(category_sub_fragments)})"
                    where_fragments.append(category_where_fragment)
        where_clause = f"WHERE {' AND '.join(where_fragments)}" if where_fragments else ""
        # Build the with clause
        source_qnode_col_name = f"nodes_{subject_qnode_key}"
        target_qnode_col_name = f"nodes_{object_qnode_key}"
        qedge_col_name = f"edges_{qedge_key}"
        # This line grabs the edge's ID and a record of which of its nodes correspond to which qnode ID
        extra_edge_properties = "{.*, " + f"id:ID({qedge_key}), {subject_qnode_key}:{subject_qnode_key}.id, {object_qnode_key}:{object_qnode_key}.id" + "}"
        with_clause = f"WITH collect(distinct {subject_qnode_key}) as {source_qnode_col_name}, " \
                      f"collect(distinct {object_qnode_key}) as {target_qnode_col_name}, " \
                      f"collect(distinct {qedge_key}{extra_edge_properties}) as {qedge_col_name}"
        # Build the return clause
        return_clause = f"RETURN {source_qnode_col_name}, {target_qnode_col_name}, {qedge_col_name}"
        cypher_query = f"{match_clause} {where_clause} {with_clause} {return_clause}"
        return cypher_query
    except Exception:
        tb = traceback.format_exc()
        error_type, error, _ = sys.exc_info()
        log.error(f"Problem generating cypher for query. {tb}", error_code=error_type.__name__)
        return ""
def _send_query_to_kp(self, query_graph: QueryGraph, log: ARAXResponse) -> Dict[str, any]:
    """Send a single-edge query graph to this KP's API, one input curie at a time.

    Returns the merged response dict (knowledge_graph nodes/edges and results
    concatenated across per-curie queries); empty dict if nothing came back.
    """
    # Send query to their API (stripping down qnode/qedges to only the properties they like)
    stripped_qnodes = []
    for qnode_key, qnode in query_graph.nodes.items():
        stripped_qnode = {'id': qnode_key, 'type': qnode.category}
        if qnode.id:
            stripped_qnode['curie'] = qnode.id
        stripped_qnodes.append(stripped_qnode)
    qedge_key = next(qedge_key for qedge_key in query_graph.edges)  # Our query graph is single-edge
    qedge = query_graph.edges[qedge_key]
    stripped_qedge = {'id': qedge_key,
                      'source_id': qedge.subject,
                      'target_id': qedge.object,
                      'type': list(self.accepted_edge_types)[0]}
    source_stripped_qnode = next(qnode for qnode in stripped_qnodes if qnode['id'] == qedge.subject)
    input_curies = eu.convert_string_or_list_to_list(source_stripped_qnode['curie'])
    combined_response = dict()
    for input_curie in input_curies:  # Until we have batch querying, ping them one-by-one for each input curie
        log.debug(f"Sending {qedge_key} query to {self.kp_name} for {input_curie}")
        # The same stripped qnode dict is reused, with its curie swapped per iteration
        source_stripped_qnode['curie'] = input_curie
        kp_response = requests.post(self.kp_query_endpoint,
                                    json={'message': {'query_graph': {'nodes': stripped_qnodes,
                                                                      'edges': [stripped_qedge]}}},
                                    headers={'accept': 'application/json'})
        if kp_response.status_code != 200:
            log.warning(f"{self.kp_name} KP API returned response of {kp_response.status_code}")
        else:
            kp_response_json = kp_response.json()
            if kp_response_json.get('results'):
                if not combined_response:
                    # First non-empty answer becomes the base of the merged response
                    combined_response = kp_response_json
                else:
                    # Subsequent answers are concatenated onto the base
                    # (assumes KG nodes/edges are lists — TODO confirm; may duplicate entries)
                    combined_response['knowledge_graph']['nodes'] += kp_response_json['knowledge_graph']['nodes']
                    combined_response['knowledge_graph']['edges'] += kp_response_json['knowledge_graph']['edges']
                    combined_response['results'] += kp_response_json['results']
    return combined_response
def _add_answers_to_kg(self, answer_kg: QGOrganizedKnowledgeGraph, reasoner_std_response: Dict[str, any],
                       input_qnode_key: str, output_qnode_key: str, qedge_key: str,
                       log: ARAXResponse) -> QGOrganizedKnowledgeGraph:
    """Load a BTE reasoner-standard response into our QG-organized answer KG.

    BTE answers use its own qg_ids ("n0"/"n1"/"e1"); these are mapped back to
    our qnode/qedge keys. Output-node curies are remapped to their preferred
    equivalent identifiers, and edge endpoints are updated to match.
    """
    kg_to_qg_ids_dict = self._build_kg_to_qg_id_dict(reasoner_std_response['results'])
    if reasoner_std_response['knowledge_graph']['edges']:
        # Records original BTE node key -> preferred curie (so edges can be re-pointed below)
        remapped_node_keys = dict()
        log.debug(f"Got results back from BTE for this query "
                  f"({len(reasoner_std_response['knowledge_graph']['edges'])} edges)")
        for node in reasoner_std_response['knowledge_graph']['nodes']:
            swagger_node = Node()
            bte_node_key = node.get('id')
            swagger_node.name = node.get('name')
            swagger_node.category = eu.convert_to_list(eu.convert_string_to_snake_case(node.get('type')))
            # Map the returned BTE qg_ids back to the original qnode_keys in our query graph
            bte_qg_id = kg_to_qg_ids_dict['nodes'].get(bte_node_key)
            if bte_qg_id == "n0":
                qnode_key = input_qnode_key
            elif bte_qg_id == "n1":
                qnode_key = output_qnode_key
            else:
                log.error("Could not map BTE qg_id to ARAX qnode_key", error_code="UnknownQGID")
                return answer_kg
            # Find and use the preferred equivalent identifier for this node (if it's an output node)
            if qnode_key == output_qnode_key:
                if bte_node_key in remapped_node_keys:
                    swagger_node_key = remapped_node_keys.get(bte_node_key)
                else:
                    # Flatten BTE's {prefix: [local_ids]} map into full curies
                    equivalent_curies = [f"{prefix}:{eu.get_curie_local_id(local_id)}" for prefix, local_ids in
                                         node.get('equivalent_identifiers').items() for local_id in local_ids]
                    swagger_node_key = self._get_best_equivalent_bte_curie(equivalent_curies, swagger_node.category[0])
                    remapped_node_keys[bte_node_key] = swagger_node_key
            else:
                swagger_node_key = bte_node_key
            answer_kg.add_node(swagger_node_key, swagger_node, qnode_key)
        for edge in reasoner_std_response['knowledge_graph']['edges']:
            swagger_edge = Edge()
            swagger_edge_key = edge.get("id")
            swagger_edge.predicate = edge.get('type')
            # Re-point edge endpoints at the remapped (preferred) node keys where applicable
            swagger_edge.subject = remapped_node_keys.get(edge.get('source_id'), edge.get('source_id'))
            swagger_edge.object = remapped_node_keys.get(edge.get('target_id'), edge.get('target_id'))
            swagger_edge.attributes = [Attribute(name="provided_by", value=edge.get('edge_source'),
                                                 type=eu.get_attribute_type("provided_by")),
                                       Attribute(name="is_defined_by", value="BTE",
                                                 type=eu.get_attribute_type("is_defined_by"))]
            # Map the returned BTE qg_id back to the original qedge_key in our query graph
            bte_qg_id = kg_to_qg_ids_dict['edges'].get(swagger_edge_key)
            if bte_qg_id != "e1":
                log.error("Could not map BTE qg_id to ARAX qedge_key", error_code="UnknownQGID")
                return answer_kg
            answer_kg.add_edge(swagger_edge_key, swagger_edge, qedge_key)
    return answer_kg
def get_curie_names(curie: Union[str, List[str]], log: ARAXResponse) -> Dict[str, str]:
    """Look up the name/label for each input curie via the NodeSynonymizer.

    Returns a map of input curie -> name, omitting curies whose names could not
    be determined (warnings are logged for those).
    """
    curies = convert_to_list(curie)
    synonymizer = NodeSynonymizer()
    log.debug(f"Looking up names for {len(curies)} input curies using NodeSynonymizer")
    synonymizer_info = synonymizer.get_normalizer_results(curies)
    curie_to_name_map = dict()
    if not synonymizer_info:
        log.error(f"NodeSynonymizer returned None", error_code="NodeNormalizationIssue")
        return curie_to_name_map
    recognized_input_curies = {c for c in synonymizer_info if synonymizer_info.get(c)}
    unrecognized_curies = set(curies).difference(recognized_input_curies)
    if unrecognized_curies:
        log.warning(f"NodeSynonymizer did not recognize: {unrecognized_curies}")
    input_curies_without_matching_node = set()
    for input_curie in recognized_input_curies:
        equivalent_nodes = synonymizer_info[input_curie]["nodes"]
        # Find the 'node' in the synonymizer corresponding to this curie
        matching_nodes = [node for node in equivalent_nodes if node["identifier"] == input_curie]
        if not matching_nodes:
            # Try looking for slight variation (KG2 vs. SRI discrepancy): "KEGG:C02700" vs. "KEGG.COMPOUND:C02700"
            stripped_curie = input_curie.replace(".COMPOUND", "")
            matching_nodes = [node for node in equivalent_nodes if node["identifier"] == stripped_curie]
        # Record the name for this input curie
        if matching_nodes:
            curie_to_name_map[input_curie] = matching_nodes[0].get("label")
        else:
            input_curies_without_matching_node.add(input_curie)
    if input_curies_without_matching_node:
        log.warning(f"No matching nodes found in NodeSynonymizer for these input curies: "
                    f"{input_curies_without_matching_node}. Cannot determine their specific names.")
    return curie_to_name_map
def _answer_query_using_plover(
        qg: QueryGraph, log: ARAXResponse
) -> Tuple[Dict[str, Dict[str, Set[Union[str, int]]]], int]:
    """Send the query graph to the Plover API.

    Returns a tuple of (Plover's answer dict, HTTP status code); the dict is
    empty when Plover returns a non-200 status.
    """
    rtxc = RTXConfiguration()
    rtxc.live = "Production"
    log.debug(f"Sending query to Plover")
    # Use a timeout so a hung Plover instance can't stall the whole query
    # (consistent with the other Plover-querying code, which uses timeout=60)
    response = requests.post(f"{rtxc.plover_url}/query", json=qg.to_dict(), timeout=60,
                             headers={'accept': 'application/json'})
    if response.status_code == 200:
        log.debug(f"Got response back from Plover")
        return response.json(), response.status_code
    else:
        log.warning(f"Plover returned a status code of {response.status_code}. Response was: {response.text}")
        return dict(), response.status_code
def _load_answers_into_kg(
        self, neo4j_results: List[Dict[str, List[Dict[str, any]]]], kg_name: str, qg: QueryGraph,
        log: ARAXResponse) -> Tuple[QGOrganizedKnowledgeGraph, Dict[str, Dict[str, str]]]:
    """Convert raw neo4j results into a QG-organized KG plus an edge->nodes map.

    Columns named 'nodes_<qnode_key>' / 'edges_<qedge_key>' are converted to
    swagger objects and filed under the corresponding QG key. The returned
    edge_to_nodes_map records, per edge, which node curie fulfilled each qnode.
    """
    log.debug(f"Processing query results for edge {next(qedge_key for qedge_key in qg.edges)}")
    final_kg = QGOrganizedKnowledgeGraph()
    edge_to_nodes_map = dict()
    # KG1 edges reference their nodes by UUID, so build a UUID->curie lookup for it
    node_uuid_to_curie_dict = self._build_node_uuid_to_curie_dict(neo4j_results[0]) if kg_name == "KG1" else dict()
    results_table = neo4j_results[0]
    column_names = [column_name for column_name in results_table]
    for column_name in column_names:
        # Load answer nodes into our knowledge graph
        if column_name.startswith('nodes'):  # Example column name: 'nodes_n00'
            column_qnode_key = column_name.replace("nodes_", "", 1)
            for neo4j_node in results_table.get(column_name):
                swagger_node_key, swagger_node = self._convert_neo4j_node_to_swagger_node(neo4j_node, kg_name)
                final_kg.add_node(swagger_node_key, swagger_node, column_qnode_key)
        # Load answer edges into our knowledge graph
        elif column_name.startswith('edges'):  # Example column name: 'edges_e01'
            column_qedge_key = column_name.replace("edges_", "", 1)
            for neo4j_edge in results_table.get(column_name):
                swagger_edge_key, swagger_edge = self._convert_neo4j_edge_to_swagger_edge(neo4j_edge, node_uuid_to_curie_dict, kg_name)
                # Record which of this edge's nodes correspond to which qnode_key
                if swagger_edge_key not in edge_to_nodes_map:
                    edge_to_nodes_map[swagger_edge_key] = dict()
                for qnode_key in qg.nodes:
                    # The cypher attaches each qnode's fulfilling curie as an edge property named after the qnode key
                    edge_to_nodes_map[swagger_edge_key][qnode_key] = neo4j_edge.get(qnode_key)
                # Finally add the current edge to our answer knowledge graph
                final_kg.add_edge(swagger_edge_key, swagger_edge, column_qedge_key)
    return final_kg, edge_to_nodes_map
def _prune_highly_connected_nodes(kg: QGOrganizedKnowledgeGraph, qedge_key: str, input_curies: Set[str],
                                  input_qnode_key: str, max_edges_per_input_curie: int,
                                  log: ARAXResponse) -> QGOrganizedKnowledgeGraph:
    """Cap the number of answer edges per input curie, then remove orphaned nodes.

    Edges in excess of max_edges_per_input_curie are removed at random for each
    input curie; affected input nodes get a 'biolink:incomplete_result_set'
    attribute documenting the truncation. Mutates and returns kg.
    """
    # First create a lookup of which edges belong to which input curies
    input_nodes_to_edges_dict = defaultdict(set)
    for edge_key, edge in kg.edges_by_qg_id[qedge_key].items():
        if edge.subject in input_curies:
            input_nodes_to_edges_dict[edge.subject].add(edge_key)
        if edge.object in input_curies:
            input_nodes_to_edges_dict[edge.object].add(edge_key)
    # Then prune down highly-connected nodes (delete edges per input curie in excess of some set limit)
    for node_key, connected_edge_keys in input_nodes_to_edges_dict.items():
        connected_edge_keys_list = list(connected_edge_keys)
        if len(connected_edge_keys_list) > max_edges_per_input_curie:
            random.shuffle(connected_edge_keys_list)  # Make it random which edges we keep for this input curie
            edge_keys_to_remove = connected_edge_keys_list[max_edges_per_input_curie:]
            log.debug(f"Randomly removing {len(edge_keys_to_remove)} edges from answer for input curie {node_key}")
            for edge_key in edge_keys_to_remove:
                # pop with default: the edge may already have been removed via another input curie
                kg.edges_by_qg_id[qedge_key].pop(edge_key, None)
            # Document that not all answers for this input curie are included
            node = kg.nodes_by_qg_id[input_qnode_key].get(node_key)
            if node:
                if not node.attributes:
                    node.attributes = []
                # Only add the truncation marker once per node
                if not any(attribute.attribute_type_id == "biolink:incomplete_result_set" for attribute in node.attributes):
                    node.attributes.append(Attribute(attribute_type_id="biolink:incomplete_result_set",  # TODO: request this as actual biolink item?
                                                     value_type_id="metatype:Boolean",
                                                     value=True,
                                                     attribute_source="infores:rtx-kg2",
                                                     description=f"This attribute indicates that not all "
                                                                 f"nodes/edges returned as answers for this input "
                                                                 f"curie were included in the final answer due to "
                                                                 f"size limitations. {max_edges_per_input_curie} "
                                                                 f"edges for this input curie were kept."))
    # Then delete any nodes orphaned by removal of edges
    node_keys_used_by_edges = kg.get_all_node_keys_used_by_edges()
    for qnode_key, nodes in kg.nodes_by_qg_id.items():
        orphan_node_keys = set(nodes).difference(node_keys_used_by_edges)
        if orphan_node_keys:
            log.debug(f"Removing {len(orphan_node_keys)} {qnode_key} nodes orphaned by the above step")
            for orphan_node_key in orphan_node_keys:
                del kg.nodes_by_qg_id[qnode_key][orphan_node_key]
    return kg
def get_node_pairs_to_overlay(subject_qnode_key: str, object_qnode_key: str, query_graph: QueryGraph,
                              knowledge_graph: KnowledgeGraph, log: ARAXResponse) -> Set[Tuple[str, str]]:
    """
    This function determines which combinations of subject/object nodes in the KG need to be overlayed (e.g., have
    a virtual edge added between). It makes use of Resultify to determine what combinations of subject and object
    nodes may actually appear together in the same Results. (See issue #1069.) If it fails to narrow the node pairs
    for whatever reason, it defaults to returning all possible combinations of subject/object nodes.
    """
    log.debug(f"Narrowing down {subject_qnode_key}--{object_qnode_key} node pairs to overlay")
    kg_nodes_by_qg_id = get_node_ids_by_qg_id(knowledge_graph)
    kg_edges_by_qg_id = get_edge_ids_by_qg_id(knowledge_graph)
    # Grab the portion of the QG already 'expanded' (aka, present in the KG)
    sub_query_graph = QueryGraph(nodes={key:qnode for key, qnode in query_graph.nodes.items() if key in set(kg_nodes_by_qg_id)},
                                 edges={key:qedge for key, qedge in query_graph.edges.items() if key in set(kg_edges_by_qg_id)})
    # Compute results using Resultify so we can see which nodes appear in the same results
    resultifier = ARAXResultify()
    sub_response = ARAXResponse()
    sub_response.envelope = Response()
    sub_response.envelope.message = Message()
    sub_message = sub_response.envelope.message
    sub_message.query_graph = sub_query_graph
    # Shallow-copy the KG's node/edge dicts so resultify can't mutate the caller's collections
    sub_message.knowledge_graph = KnowledgeGraph(nodes=knowledge_graph.nodes.copy(),
                                                 edges=knowledge_graph.edges.copy())
    #sub_response.envelope.message = sub_message
    resultify_response = resultifier.apply(sub_response, {})
    # Figure out which node pairs appear together in one or more results
    if resultify_response.status == 'OK':
        node_pairs = set()
        for result in sub_message.results:
            # Collect the curies bound to the subject/object qnodes within this single result
            subject_curies_in_this_result = {node_binding.id for key, node_binding_list in result.node_bindings.items()
                                             for node_binding in node_binding_list if key == subject_qnode_key}
            object_curies_in_this_result = {node_binding.id for key, node_binding_list in result.node_bindings.items()
                                            for node_binding in node_binding_list if key == object_qnode_key}
            pairs_in_this_result = set(itertools.product(subject_curies_in_this_result, object_curies_in_this_result))
            node_pairs = node_pairs.union(pairs_in_this_result)
        log.debug(f"Identified {len(node_pairs)} node pairs to overlay (with help of resultify)")
        if node_pairs:
            return node_pairs
    # Back up to using the old (O(n^2)) method of all combinations of subject/object nodes in the KG
    log.warning(f"Failed to narrow down node pairs to overlay; defaulting to all possible combinations")
    return set(itertools.product(kg_nodes_by_qg_id[subject_qnode_key], kg_nodes_by_qg_id[object_qnode_key]))
def make_qg_use_supported_prefixes(
        self, qg: QueryGraph, kp_name: str, log: ARAXResponse) -> Optional[QueryGraph]:
    """Convert each qnode's curies to prefixes the given KP supports.

    Mutates and returns qg, or returns None when some qnode's curies cannot be
    converted to any supported prefix (meaning the KP can't answer the query).
    """
    for qnode_key, qnode in qg.nodes.items():
        if qnode.ids:
            if kp_name == "infores:rtx-kg2":
                # Just convert them into canonical curies
                qnode.ids = eu.get_canonical_curies_list(qnode.ids, log)
            else:
                # Otherwise figure out which kind of curies KPs want
                categories = eu.convert_to_list(qnode.categories)
                supported_prefixes = self._get_supported_prefixes(categories, kp_name)
                used_prefixes = {self._get_uppercase_prefix(curie) for curie in qnode.ids}
                # Only convert curie(s) if any use an unsupported prefix
                if used_prefixes.issubset(supported_prefixes):
                    # Use the 'log' parameter (previously self.log) so all messages
                    # consistently land on the caller's response
                    log.debug(f"{kp_name}: All {qnode_key} curies use prefix(es) {kp_name} supports; no "
                              f"conversion necessary")
                else:
                    log.debug(f"{kp_name}: One or more {qnode_key} curies use a prefix {kp_name} doesn't "
                              f"support; will convert these")
                    converted_curies = self.get_desirable_equivalent_curies(qnode.ids, qnode.categories, kp_name)
                    if converted_curies:
                        log.debug(f"{kp_name}: Converted {qnode_key}'s {len(qnode.ids)} curies to a list of "
                                  f"{len(converted_curies)} curies tailored for {kp_name}")
                        qnode.ids = converted_curies
                    else:
                        log.info(f"{kp_name} cannot answer the query because no equivalent curies were found "
                                 f"with prefixes it supports for qnode {qnode_key}. Original curies were: "
                                 f"{qnode.ids}")
                        return None
    return qg
def _answer_query_using_plover(qg: QueryGraph, log: ARAXResponse) -> Tuple[Dict[str, Dict[str, Union[set, dict]]], int]:
    """Send the query graph to Plover and return (answer dict, HTTP status code).

    The answer dict is empty when Plover responds with a non-200 status.
    """
    rtxc = RTXConfiguration()
    rtxc.live = "Production"
    # First prep the query graph (requires some minor additions for Plover)
    dict_qg = qg.to_dict()
    dict_qg["include_metadata"] = True  # Ask plover to return node/edge objects (not just IDs)
    dict_qg["respect_predicate_symmetry"] = True  # Ignore direction for symmetric predicate, enforce for asymmetric
    # Allow subclass_of reasoning for qnodes with a small number of curies
    for qnode in dict_qg["nodes"].values():
        qnode_ids = qnode.get("ids")
        if qnode_ids and len(qnode_ids) < 5 and qnode.get("allow_subclasses") is None:
            qnode["allow_subclasses"] = True
    # Then send the actual query
    response = requests.post(f"{rtxc.plover_url}/query", json=dict_qg, timeout=60,
                             headers={'accept': 'application/json'})
    if response.status_code != 200:
        log.warning(f"Plover returned a status code of {response.status_code}. Response was: {response.text}")
        return dict(), response.status_code
    log.debug(f"Got response back from Plover")
    return response.json(), response.status_code
def get_curie_synonyms(curie: Union[str, List[str]], log: ARAXResponse) -> List[str]:
    """Return a sorted list of all equivalent curies for the input curie(s).

    The input curies themselves are always included (even when no synonyms are
    found); an empty list is returned on synonymizer failure.
    """
    input_curies = convert_string_or_list_to_list(curie)
    try:
        synonymizer = NodeSynonymizer()
        log.debug(f"Sending NodeSynonymizer.get_equivalent_nodes() a list of {len(input_curies)} curies")
        equivalent_curies_dict = synonymizer.get_equivalent_nodes(input_curies, kg_name="KG2")
        log.debug(f"Got response back from NodeSynonymizer")
    except Exception:
        tb = traceback.format_exc()
        error_type, error, _ = sys.exc_info()
        log.error(f"Encountered a problem using NodeSynonymizer: {tb}", error_code=error_type.__name__)
        return []
    if equivalent_curies_dict is None:
        log.error(f"NodeSynonymizer returned None", error_code="NodeNormalizationIssue")
        return []
    curies_missing_info = {key for key, value in equivalent_curies_dict.items() if not value}
    if curies_missing_info:
        log.warning(f"NodeSynonymizer did not find any equivalent curies for: {curies_missing_info}")
    equivalent_curies = {synonym for curie_dict in equivalent_curies_dict.values() if curie_dict
                         for synonym in curie_dict}
    all_curies = equivalent_curies.union(set(input_curies))  # Make sure even curies without synonyms are included
    return sorted(list(all_curies))
def decorate_nodes(self, response: ARAXResponse) -> ARAXResponse:
    """Decorate the message's KG nodes with metadata attributes from the KG2c sqlite database.

    Looks up each KG node's row in the local KG2c sqlite nodes table and appends
    any attributes the node doesn't already carry. Returns the same response.
    """
    message = response.envelope.message
    response.debug(f"Decorating nodes with metadata from KG2c")
    # Get connected to the local KG2c sqlite database
    connection, cursor = self._connect_to_kg2c_sqlite()
    # Extract the KG2c nodes from sqlite
    response.debug(f"Looking up corresponding KG2c nodes in sqlite")
    node_attributes_ordered = list(self.node_attributes)
    node_cols_str = ", ".join([f"N.{property_name}" for property_name in node_attributes_ordered])
    # NOTE(review): SQL is built by string interpolation with manual quote doubling
    # rather than parameterized placeholders; assumes node keys contain no other
    # SQL-significant characters — the doubled single quote is sqlite's escape
    node_keys = set(node_key.replace("'", "''") for node_key in message.knowledge_graph.nodes)  # Escape quotes
    node_keys_str = "','".join(node_keys)  # SQL wants ('node1', 'node2') format for string lists
    sql_query = f"SELECT N.id, {node_cols_str} " \
                f"FROM nodes AS N " \
                f"WHERE N.id IN ('{node_keys_str}')"
    cursor.execute(sql_query)
    rows = cursor.fetchall()
    cursor.close()
    connection.close()
    # Decorate nodes in the KG with info in these KG2c nodes
    response.debug(f"Adding attributes to nodes in the KG")
    for row in rows:
        # First create the attributes for this KG2c node
        node_id = row[0]
        trapi_node = message.knowledge_graph.nodes[node_id]
        kg2c_node_attributes = []
        for index, property_name in enumerate(node_attributes_ordered):
            value = self._load_property(property_name, row[index + 1])  # Add one to account for 'id' column
            if value:
                kg2c_node_attributes.append(self.create_attribute(property_name, value))
        # Then decorate the TRAPI node with those attributes it doesn't already have
        existing_attribute_triples = {self._get_attribute_triple(attribute)
                                      for attribute in trapi_node.attributes} if trapi_node.attributes else set()
        novel_attributes = [attribute for attribute in kg2c_node_attributes
                            if self._get_attribute_triple(attribute) not in existing_attribute_triples]
        if trapi_node.attributes:
            trapi_node.attributes += novel_attributes
        else:
            trapi_node.attributes = novel_attributes
    return response
def _pre_process_query_graph(self, query_graph: QueryGraph, log: ARAXResponse) -> QueryGraph:
    """Adapt the query graph's node categories and curies to what this KP accepts.

    Mutates and returns query_graph; logs an error (and returns early) when a
    qnode's category can't be mapped to one the KP supports.
    """
    for qnode_key, qnode in query_graph.nodes.items():
        # Convert node types to preferred format and verify we can do this query
        formatted_qnode_categories = {self.node_category_overrides_for_kp.get(qnode_category, qnode_category)
                                      for qnode_category in eu.convert_string_or_list_to_list(qnode.category)}
        accepted_qnode_categories = formatted_qnode_categories.intersection(self.accepted_node_categories)
        if not accepted_qnode_categories:
            log.error(f"{self.kp_name} can only be used for queries involving {self.accepted_node_categories} "
                      f"and QNode {qnode_key} has category '{qnode.category}'",
                      error_code="UnsupportedQueryForKP")
            return query_graph
        qnode.category = list(accepted_qnode_categories)[0]
        # Convert curies to equivalent curies accepted by the KP (depending on qnode type)
        if qnode.id:
            preferred_prefix = self.kp_preferred_prefixes[qnode.category]
            equivalent_curies = eu.get_curie_synonyms(qnode.id, log)
            desired_curies = [synonym for synonym in equivalent_curies
                              if synonym.startswith(f"{preferred_prefix}:")]
            if desired_curies:
                # Collapse to a bare string when only a single curie remains
                qnode.id = desired_curies if len(desired_curies) > 1 else desired_curies[0]
                log.debug(f"Converted qnode {qnode_key} curie to {qnode.id}")
            else:
                log.warning(f"Could not convert qnode {qnode_key} curie(s) to preferred prefix ({self.kp_preferred_prefixes[qnode.category]})")
    return query_graph
def _load_plover_answer_into_object_model(self, plover_answer: Dict[str, Dict[str, Union[set, dict]]],
                                          log: ARAXResponse) -> QGOrganizedKnowledgeGraph:
    """Convert a raw Plover answer (node/edge tuples keyed by qnode/qedge) into a
    QGOrganizedKnowledgeGraph of TRAPI nodes and edges, logging timing per group."""
    answer_kg = QGOrganizedKnowledgeGraph()

    # Load returned nodes into TRAPI object model
    for qnode_key, nodes in plover_answer["nodes"].items():
        num_nodes = len(nodes)
        log.debug(f"Loading {num_nodes} {qnode_key} nodes into TRAPI object model")
        start = time.time()
        for node_key, node_tuple in nodes.items():
            trapi_node = self._convert_kg2c_plover_node_to_trapi_node(node_tuple)
            answer_kg.add_node(node_key, trapi_node, qnode_key)
        log.debug(f"Loading {num_nodes} {qnode_key} nodes into TRAPI object model took "
                  f"{round(time.time() - start, 2)} seconds")

    # Load returned edges into TRAPI object model
    for qedge_key, edges in plover_answer["edges"].items():
        num_edges = len(edges)
        log.debug(f"Loading {num_edges} edges into TRAPI object model")
        start = time.time()
        for edge_key, edge_tuple in edges.items():
            trapi_edge = self._convert_kg2c_plover_edge_to_trapi_edge(edge_tuple)
            answer_kg.add_edge(edge_key, trapi_edge, qedge_key)
        log.debug(f"Loading {num_edges} {qedge_key} edges into TRAPI object model took "
                  f"{round(time.time() - start, 2)} seconds")

    return answer_kg
def assess(self, message):
    """Assess the QueryGraph in `message` for basic structural information.

    Computes per-node and per-edge stats, identifies a start node, walks the graph to
    establish a node/edge order, and renders a text "template" of the graph geometry
    (e.g. "n00(id)-e00()-n01(category)") for later matching. Results are stored on
    self (node_info, edge_info, node_order, edge_order, query_graph_templates, ...).

    :param message: A TRAPI-style message with a .query_graph attribute.
    :return: An ARAXResponse; errors are recorded for malformed graphs.
    """
    #### Define a default response
    response = ARAXResponse()
    self.response = response
    self.message = message
    response.debug(f"Assessing the QueryGraph for basic information")

    #### Get shorter handles
    query_graph = message.query_graph
    nodes = query_graph.nodes
    edges = query_graph.edges

    #### Store number of nodes and edges
    self.n_nodes = len(nodes)
    self.n_edges = len(edges)
    response.debug(f"Found {self.n_nodes} nodes and {self.n_edges} edges")

    #### Handle impossible cases
    if self.n_nodes == 0:
        response.error("QueryGraph has 0 nodes. At least 1 node is required", error_code="QueryGraphZeroNodes")
        return response
    if self.n_nodes == 1 and self.n_edges > 0:
        response.error("QueryGraph may not have edges if there is only one node", error_code="QueryGraphTooManyEdges")
        return response
    #if self.n_nodes == 2 and self.n_edges > 1:
    #    response.error("QueryGraph may not have more than 1 edge if there are only 2 nodes", error_code="QueryGraphTooManyEdges")
    #    return response

    #### Loop through nodes computing some stats
    node_info = {}
    self.node_category_map = {}
    for key, qnode in nodes.items():
        # Per-node bookkeeping record; 'edges'/'edge_dict' get filled during the edge loop below
        node_info[key] = {'key': key, 'node_object': qnode, 'has_id': False, 'category': qnode.category,
                          'has_category': False, 'is_set': False, 'n_edges': 0, 'n_links': 0,
                          'is_connected': False, 'edges': [], 'edge_dict': {}}
        if qnode.id is not None:
            node_info[key]['has_id'] = True

            #### If the user did not specify a category, but there is a curie, try to figure out the category
            if node_info[key]['category'] is None:
                synonymizer = NodeSynonymizer()
                curie = qnode.id
                curies_list = qnode.id
                # Normalize to (first curie, list of curies) whether qnode.id is a str or a list
                if isinstance(qnode.id, list):
                    curie = qnode.id[0]
                else:
                    curies_list = [qnode.id]
                canonical_curies = synonymizer.get_canonical_curies(curies=curies_list, return_all_categories=True)
                # NOTE(review): only the first curie's preferred_type is consulted even when a list was given
                if curie in canonical_curies and 'preferred_type' in canonical_curies[curie]:
                    node_info[key]['has_category'] = True
                    node_info[key]['category'] = canonical_curies[curie]['preferred_type']

        if qnode.category is not None:
            node_info[key]['has_category'] = True
        #if qnode.is_set is not None: node_info[key]['is_set'] = True

        if key is None:
            response.error("QueryGraph has a node with null key. This is not permitted", error_code="QueryGraphNodeWithNoId")
            return response

        #### Remap the node categorys from unsupported to supported
        if qnode.category is not None:
            qnode.category = self.remap_node_category(qnode.category)

        #### Store lookup of categorys
        # NOTE(review): warning_counter is reset every iteration, so the "warn only once"
        # guard below can never actually suppress repeats (the warning is commented out anyway)
        warning_counter = 0
        if qnode.category is None or (isinstance(qnode.category, list) and len(qnode.category) == 0):
            if warning_counter == 0:
                #response.debug("QueryGraph has nodes with no category. This may cause problems with results inference later")
                pass
            warning_counter += 1
            self.node_category_map['unknown'] = key
        else:
            category = qnode.category
            if isinstance(qnode.category, list):
                category = qnode.category[0]  # FIXME this is a hack prior to proper list handling
            self.node_category_map[category] = key

    #### Loop through edges computing some stats
    edge_info = {}
    self.edge_predicate_map = {}
    unique_links = {}

    #### Ignore special informationational edges for now.
    virtual_edge_predicates = {'has_normalized_google_distance_with': 1, 'has_fisher_exact_test_p-value_with': 1,
                               'has_jaccard_index_with': 1, 'probably_treats': 1,
                               'has_paired_concept_frequency_with': 1, 'has_observed_expected_ratio_with': 1,
                               'has_chi_square_with': 1}

    for key, qedge in edges.items():
        predicate = qedge.predicate
        # Collapse a list-valued predicate to its first element (or None if empty)
        if isinstance(predicate, list):
            if len(predicate) == 0:
                predicate = None
            else:
                predicate = predicate[0]  # FIXME Hack before dealing with predicates as lists!
        if predicate is not None and predicate in virtual_edge_predicates:
            continue

        edge_info[key] = {'key': key, 'has_predicate': False, 'subject': qedge.subject,
                          'object': qedge.object, 'predicate': None}
        if predicate is not None:
            edge_info[key]['has_predicate'] = True
            edge_info[key]['predicate'] = predicate

        if key is None:
            response.error("QueryGraph has a edge with null key. This is not permitted", error_code="QueryGraphEdgeWithNoKey")
            return response

        #### Create a unique node link string
        # n_links counts distinct neighbor pairs; n_edges counts all (possibly parallel) edges
        link_string = ','.join(sorted([qedge.subject, qedge.object]))
        if link_string not in unique_links:
            node_info[qedge.subject]['n_links'] += 1
            node_info[qedge.object]['n_links'] += 1
            unique_links[link_string] = 1
        #print(link_string)

        node_info[qedge.subject]['n_edges'] += 1
        node_info[qedge.object]['n_edges'] += 1
        node_info[qedge.subject]['is_connected'] = True
        node_info[qedge.object]['is_connected'] = True
        #node_info[qedge.subject]['edges'].append(edge_info[key])
        #node_info[qedge.object]['edges'].append(edge_info[key])
        node_info[qedge.subject]['edges'].append(edge_info[key])
        node_info[qedge.object]['edges'].append(edge_info[key])
        node_info[qedge.subject]['edge_dict'][key] = edge_info[key]
        node_info[qedge.object]['edge_dict'][key] = edge_info[key]

        #### Store lookup of predicates
        # NOTE(review): warning_counter is reset per-edge here too, so the debug message
        # fires for every predicate-less edge rather than once
        warning_counter = 0
        edge_predicate = 'any'
        if predicate is None:
            if warning_counter == 0:
                response.debug("QueryGraph has edges with no predicate. This may cause problems with results inference later")
            warning_counter += 1
        else:
            edge_predicate = predicate

        #### It's not clear yet whether we need to store the whole sentence or just the predicate
        #predicate_encoding = f"{node_info[qedge.subject]['predicate']}---{edge_predicate}---{node_info[qedge.object]['predicate']}"
        predicate_encoding = edge_predicate
        self.edge_predicate_map[predicate_encoding] = key

    #### Loop through the nodes again, trying to identify the start_node and the end_node
    singletons = []
    for node_id, node_data in node_info.items():
        if node_data['n_links'] < 2:
            singletons.append(node_data)
        elif node_data['n_links'] > 2:
            self.is_bifurcated_graph = True
            response.warning("QueryGraph appears to have a fork in it. This might cause trouble")

    #### If this doesn't produce any singletons, then try curie based selection
    if len(singletons) == 0:
        for node_id, node_data in node_info.items():
            if node_data['has_id']:
                singletons.append(node_data)

    #### If this doesn't produce any singletons, then we don't know how to continue
    if len(singletons) == 0:
        response.error("Unable to understand the query graph", error_code="QueryGraphCircular")
        return response

    #### Try to identify the start_node and the end_node
    # Prefer a singleton with a curie as the start of the walk
    start_node = singletons[0]
    if len(nodes) == 1:
        # Just a single node, fine
        pass
    elif len(singletons) < 2:
        response.warning("QueryGraph appears to be circular or has a strange geometry. This might cause trouble")
    elif len(singletons) > 2:
        response.warning("QueryGraph appears to have a fork in it. This might cause trouble")
    else:
        if singletons[0]['has_id'] is True and singletons[1]['has_id'] is False:
            start_node = singletons[0]
        elif singletons[0]['has_id'] is False and singletons[1]['has_id'] is True:
            start_node = singletons[1]
        else:
            start_node = singletons[0]

    #### Hmm, that's not very robust against odd graphs. This needs work. FIXME
    self.node_info = node_info
    self.edge_info = edge_info
    self.start_node = start_node

    # Walk the graph from start_node, following edges not already seen via the previous node,
    # to build a linear node/edge order (assumes an essentially path-shaped graph)
    current_node = start_node
    node_order = [start_node]
    edge_order = []
    edges = current_node['edges']
    debug = False
    while 1:
        if debug:
            tmp = {'astate': '1', 'current_node': current_node, 'node_order': node_order,
                   'edge_order': edge_order, 'edges': edges}
            print(json.dumps(ast.literal_eval(repr(tmp)), sort_keys=True, indent=2))
            print('==================================================================================')
            tmp = input()

        if len(edges) == 0:
            break

        #if len(edges) > 1:
        if current_node['n_links'] > 1:
            response.error(f"Help, two edges at A583. Don't know what to do: {current_node['n_links']}", error_code="InteralErrorA583")
            return response

        edge_order.append(edges[0])
        previous_node = current_node
        # Step across the edge to the node on its other end
        if edges[0]['subject'] == current_node['key']:
            current_node = node_info[edges[0]['object']]
        elif edges[0]['object'] == current_node['key']:
            current_node = node_info[edges[0]['subject']]
        else:
            response.error("Help, edge error A584. Don't know what to do", error_code="InteralErrorA584")
            return response
        node_order.append(current_node)

        #tmp = { 'astate': '2', 'current_node': current_node, 'node_order': node_order, 'edge_order': edge_order, 'edges': edges }
        #print(json.dumps(ast.literal_eval(repr(tmp)),sort_keys=True,indent=2))
        #print('==================================================================================')
        #tmp = input()

        # Keep only edges we haven't already traversed via the previous node
        edges = current_node['edges']
        new_edges = []
        for edge in edges:
            key = edge['key']
            if key not in previous_node['edge_dict']:
                new_edges.append(edge)
        edges = new_edges
        if len(edges) == 0:
            break

        #tmp = { 'astate': '3', 'current_node': current_node, 'node_order': node_order, 'edge_order': edge_order, 'edges': edges }
        #print(json.dumps(ast.literal_eval(repr(tmp)),sort_keys=True,indent=2))
        #print('==================================================================================')
        #tmp = input()

    self.node_order = node_order
    self.edge_order = edge_order

    # Create a text rendering of the QueryGraph geometry for matching against a template
    self.query_graph_templates = {'simple': '', 'detailed': {'n_nodes': len(node_order), 'components': []}}
    node_index = 0
    edge_index = 0
    #print(json.dumps(ast.literal_eval(repr(node_order)),sort_keys=True,indent=2))
    for node in node_order:
        component_id = f"n{node_index:02}"
        content = ''
        component = {'component_type': 'node', 'component_id': component_id, 'has_id': node['has_id'],
                     'has_category': node['has_category'], 'category_value': None}
        self.query_graph_templates['detailed']['components'].append(component)
        if node['has_id']:
            content = 'id'
        elif node['has_category'] and node['node_object'].category is not None:
            content = f"category={node['node_object'].category}"
            component['category_value'] = node['node_object'].category
        elif node['has_category']:
            content = 'category'
        template_part = f"{component_id}({content})"
        self.query_graph_templates['simple'] += template_part

        # Since queries with intermediate nodes that are not is_set=true tend to blow up, for now, make them is_set=true unless explicitly set to false
        if node_index > 0 and node_index < (self.n_nodes - 1):
            # NOTE(review): this checks the bookkeeping dict ('is_set' is always present there,
            # initialized False) rather than node['node_object'].is_set — verify intent
            if 'is_set' not in node or node['is_set'] is None:
                node['node_object'].is_set = True
                response.warning(f"Setting unspecified is_set to true for {node['key']} because this will probably lead to a happier result")
            elif node['is_set'] is True:
                response.debug(f"Value for is_set is already true for {node['key']} so that's good")
            elif node['is_set'] is False:
                #response.info(f"Value for is_set is set to false for intermediate node {node['key']}. This could lead to weird results. Consider setting it to true")
                response.info(f"Value for is_set is false for intermediate node {node['key']}. Setting to true because this will probably lead to a happier result")
                node['node_object'].is_set = True
            #else:
            #    response.error(f"Unrecognized value is_set='{node['is_set']}' for {node['key']}. This should be true or false")

        node_index += 1
        if node_index < self.n_nodes:
            #print(json.dumps(ast.literal_eval(repr(node)),sort_keys=True,indent=2))

            #### Extract the has_predicate and predicate_value from the edges of the node
            #### This could fail if there are two edges coming out of the node FIXME
            has_predicate = False
            predicate_value = None
            if 'edges' in node:
                for related_edge in node['edges']:
                    if related_edge['subject'] == node['key']:
                        has_predicate = related_edge['has_predicate']
                        if has_predicate is True and 'predicate' in related_edge:
                            predicate_value = related_edge['predicate']

            component_id = f"e{edge_index:02}"
            template_part = f"-{component_id}()-"
            self.query_graph_templates['simple'] += template_part
            component = {'component_type': 'edge', 'component_id': component_id, 'has_id': False,
                         'has_predicate': has_predicate, 'predicate_value': predicate_value}
            self.query_graph_templates['detailed']['components'].append(component)
            edge_index += 1

    response.debug(f"The QueryGraph reference template is: {self.query_graph_templates['simple']}")

    #tmp = { 'node_info': node_info, 'edge_info': edge_info, 'start_node': start_node, 'n_nodes': self.n_nodes, 'n_edges': self.n_edges,
    #    'is_bifurcated_graph': self.is_bifurcated_graph, 'node_order': node_order, 'edge_order': edge_order }
    #print(json.dumps(ast.literal_eval(repr(tmp)),sort_keys=True,indent=2))
    #sys.exit(0)

    #### Return the response
    return response
def _grab_nodes_and_edges_from_sqlite(self, plover_answer: Dict[str, Dict[str, Set[Union[str, int]]]],
                                      kg_name: str, log: ARAXResponse) -> QGOrganizedKnowledgeGraph:
    """Hydrate a Plover answer (bare node/edge IDs) into full TRAPI nodes/edges using KG2c sqlite.

    The sqlite file location is derived from this file's path within the RTX repo plus the
    filename configured in RTXConfiguration.
    """
    # Get connected to the local sqlite database (look up its path using database manager-friendly method)
    path_list = os.path.realpath(__file__).split(os.path.sep)
    rtx_index = path_list.index("RTX")
    rtxc = RTXConfiguration()
    sqlite_dir_path = os.path.sep.join([*path_list[:(rtx_index + 1)], 'code', 'ARAX', 'KnowledgeSources', 'KG2c'])
    sqlite_name = rtxc.kg2c_sqlite_path.split('/')[-1]
    db_path = f"{sqlite_dir_path}{os.path.sep}{sqlite_name}"
    connection = sqlite3.connect(db_path)
    cursor = connection.cursor()
    answer_kg = QGOrganizedKnowledgeGraph()

    # Grab the node objects from sqlite corresponding to the returned node IDs
    num_nodes = sum(len(nodes) for nodes in plover_answer["nodes"].values())
    start = time.time()
    for qnode_key, node_keys in plover_answer["nodes"].items():
        node_keys_str = "','".join(node_keys)  # SQL wants ('node1', 'node2') format for string lists
        sql_query = f"SELECT N.node " \
                    f"FROM nodes AS N " \
                    f"WHERE N.id IN ('{node_keys_str}')"
        log.debug(f"Looking up {len(plover_answer['nodes'][qnode_key])} returned {qnode_key} node IDs in KG2c sqlite")
        cursor.execute(sql_query)
        for row in cursor.fetchall():
            node_dict = ujson.loads(row[0])
            node_key, trapi_node = self._convert_neo4j_node_to_trapi_node(node_dict, kg_name)
            answer_kg.add_node(node_key, trapi_node, qnode_key)
    log.debug(f"Grabbing {num_nodes} nodes from sqlite and loading into object model took "
              f"{round(time.time() - start, 2)} seconds")

    # Grab the edge objects from sqlite corresponding to the returned edge IDs
    num_edges = sum(len(edges) for edges in plover_answer["edges"].values())
    start = time.time()
    for qedge_key, edge_keys in plover_answer["edges"].items():
        edge_keys_str = ",".join(str(edge_key) for edge_key in edge_keys)  # SQL wants (1, 2) format int lists
        sql_query = f"SELECT E.edge " \
                    f"FROM edges AS E " \
                    f"WHERE E.id IN ({edge_keys_str})"
        log.debug(f"Looking up {len(plover_answer['edges'][qedge_key])} returned {qedge_key} edge IDs in KG2c sqlite")
        cursor.execute(sql_query)
        for row in cursor.fetchall():
            edge_dict = ujson.loads(row[0])
            edge_key, trapi_edge = self._convert_neo4j_edge_to_trapi_edge(edge_dict, dict(), kg_name)
            answer_kg.add_edge(edge_key, trapi_edge, qedge_key)
    log.debug(f"Grabbing {num_edges} edges from sqlite and loading into object model took "
              f"{round(time.time() - start, 2)} seconds")

    cursor.close()
    connection.close()
    return answer_kg
def decorate_edges(self, response: ARAXResponse, kind: Optional[str] = "RTX-KG2") -> ARAXResponse:
    """
    Decorates edges with publication sentences and any other available EPC info.
    kind: The kind of edges to decorate, either: "NGD" or "RTX-KG2". For NGD edges, publications info
    attributes are added. For RTX-KG2 edges, attributes for all EPC properties are added.
    """
    kg = response.envelope.message.knowledge_graph
    response.debug(f"Decorating edges with EPC info from KG2c")
    supported_kinds = {"RTX-KG2", "NGD"}
    if kind not in supported_kinds:
        # NOTE(review): this error call passes no error_code, unlike other errors in this file — confirm intent
        response.error(f"Supported values for ARAXDecorator.decorate_edges()'s 'kind' parameter are: "
                       f"{supported_kinds}")
        return response
    # Figure out which edges we need to decorate
    if kind == "RTX-KG2":
        # KG2 mode: edges that list the KG2 infores curie as an aggregator knowledge source
        edge_keys_to_decorate = {edge_id for edge_id, edge in kg.edges.items()
                                 if edge.attributes and any(attribute.value == self.kg2_infores_curie
                                                            and attribute.attribute_type_id == "biolink:aggregator_knowledge_source"
                                                            for attribute in edge.attributes)}
    else:
        # NGD mode: edges created by the NGD overlay, identified by predicate
        edge_keys_to_decorate = {edge_id for edge_id, edge in kg.edges.items()
                                 if edge.predicate == "biolink:has_normalized_google_distance_with"}
    if not edge_keys_to_decorate:
        response.debug(f"Could not identify any {kind} edges to decorate")
    else:
        response.debug(f"Identified {len(edge_keys_to_decorate)} edges to decorate")
    # Determine the search keys for these edges that we need to look up in sqlite
    # (NGD edges are looked up by node pair; KG2 edges by subject--predicate--object triple)
    search_key_to_edge_keys_map = defaultdict(set)
    if kind == "NGD":  # For now only NGD/overlay will use this mode
        for edge_key in edge_keys_to_decorate:
            edge = kg.edges[edge_key]
            search_key = f"{edge.subject}--{edge.object}"
            search_key_to_edge_keys_map[search_key].add(edge_key)
        search_key_column = "node_pair"
    else:  # This is the mode used for decorating KG2 edges (or other KPs' edges)
        for edge_key in edge_keys_to_decorate:
            edge = kg.edges[edge_key]
            search_key = f"{edge.subject}--{edge.predicate}--{edge.object}"
            search_key_to_edge_keys_map[search_key].add(edge_key)
        search_key_column = "triple"

    # Extract the proper entries from sqlite
    # NOTE(review): connection/cursor are not closed if the query raises — consider try/finally
    connection, cursor = self._connect_to_kg2c_sqlite()
    response.debug(f"Looking up EPC edge info in KG2c sqlite")
    edge_attributes_ordered = list(self.edge_attributes)
    edge_cols_str = ", ".join([f"E.{property_name}" for property_name in edge_attributes_ordered])
    search_keys_set = set(search_key.replace("'", "''") for search_key in set(search_key_to_edge_keys_map))  # Escape quotes
    search_keys_str = "','".join(search_keys_set)  # SQL wants ('node1', 'node2') format for string lists
    sql_query = f"SELECT E.{search_key_column}, {edge_cols_str} " \
                f"FROM edges AS E " \
                f"WHERE E.{search_key_column} IN ('{search_keys_str}')"
    cursor.execute(sql_query)
    rows = cursor.fetchall()
    cursor.close()
    connection.close()
    response.debug(f"Got {len(rows)} rows back from KG2c sqlite")

    response.debug(f"Adding attributes to edges in the KG")
    # Create a helper lookup map for easy access to returned rows
    search_key_to_kg2c_edge_tuples_map = defaultdict(list)
    for row in rows:
        search_key = row[0]
        search_key_to_kg2c_edge_tuples_map[search_key].append(row)
    # Map each property name to the attribute_type_id create_attribute() would assign it
    # (the "something" value is a throwaway; only the type id is used)
    attribute_type_id_map = {property_name: self.create_attribute(property_name, "something").attribute_type_id
                             for property_name in set(self.edge_attributes).difference({"knowledge_source"})}
    for search_key, kg2c_edge_tuples in search_key_to_kg2c_edge_tuples_map.items():
        # Join the property values found for all edges matching the given search key
        merged_kg2c_properties = {property_name: None for property_name in edge_attributes_ordered}
        for kg2c_edge_tuple in kg2c_edge_tuples:
            for index, property_name in enumerate(edge_attributes_ordered):
                raw_value = kg2c_edge_tuple[index + 1]  # Add one to account for the search-key column
                if raw_value:  # Skip empty attributes
                    value = self._load_property(property_name, raw_value)
                    # List-valued properties are merged as sets; everything else is merged as a dict
                    if not merged_kg2c_properties.get(property_name):
                        merged_kg2c_properties[property_name] = set() if isinstance(value, list) else dict()
                    if isinstance(value, list):
                        merged_kg2c_properties[property_name].update(set(value))
                    else:
                        merged_kg2c_properties[property_name].update(value)
        # Only attach a knowledge source to created attributes when exactly one source was found
        joined_knowledge_sources = list(merged_kg2c_properties["knowledge_source"]) if merged_kg2c_properties.get("knowledge_source") else set()
        knowledge_source = joined_knowledge_sources[0] if len(joined_knowledge_sources) == 1 else None
        joined_kg2_ids = list(merged_kg2c_properties["kg2_ids"]) if merged_kg2c_properties.get("kg2_ids") else set()
        joined_publications = list(merged_kg2c_properties["publications"]) if merged_kg2c_properties.get("publications") else set()
        joined_publications_info = merged_kg2c_properties["publications_info"] if merged_kg2c_properties.get("publications_info") else dict()

        # Add the joined attributes to each of the edges with the given search key (as needed)
        corresponding_bare_edge_keys = search_key_to_edge_keys_map[search_key]
        for edge_key in corresponding_bare_edge_keys:
            bare_edge = kg.edges[edge_key]
            # De-duplicate by attribute_type_id so we never add a second copy of an attribute kind
            existing_attribute_type_ids = {attribute.attribute_type_id
                                           for attribute in bare_edge.attributes} if bare_edge.attributes else set()
            new_attributes = []
            # Create KG2 edge-specific attributes
            if kind == "RTX-KG2":
                if attribute_type_id_map["kg2_ids"] not in existing_attribute_type_ids:
                    new_attributes.append(self.create_attribute("kg2_ids", list(joined_kg2_ids)))
                if joined_publications and attribute_type_id_map["publications"] not in existing_attribute_type_ids:
                    new_attributes.append(self.create_attribute("publications", list(joined_publications),
                                                                attribute_source=knowledge_source))
            # Create attributes that belong on both KG2 and NGD edges
            if joined_publications_info and attribute_type_id_map["publications_info"] not in existing_attribute_type_ids:
                new_attributes.append(self.create_attribute("publications_info", joined_publications_info,
                                                            attribute_source=knowledge_source))
            # Actually tack the new attributes onto the edge
            if new_attributes:
                if not bare_edge.attributes:
                    bare_edge.attributes = new_attributes
                else:
                    bare_edge.attributes += new_attributes
    return response
def parse(self, input_actions):
    """Parse a list of DSL action strings into structured action dicts.

    Each action is either a bare command ("expand") or a command with a
    comma-separated parameter list ("add_qnode(id=DOID:1234, key=n00)").
    Parameter values of the form [a,b,c] are recomposed into Python lists
    (commas inside brackets were split apart by the top-level comma split).

    :param input_actions: List of action strings; blank lines and '#' comments are skipped.
    :return: An ARAXResponse with response.data["actions"] set to the parsed list.
    """
    #### Define a default response
    response = ARAXResponse()
    response.info(f"Parsing input actions list")

    #### Basic error checking of the input_actions
    if not isinstance(input_actions, list):
        response.error("Provided input actions is not a list", error_code="ActionsNotList")
        return response
    if len(input_actions) == 0:
        response.error("Provided input actions is an empty list", error_code="ActionsListEmpty")
        return response

    #### Iterate through the list, checking the items
    actions = []
    n_lines = 1
    for action in input_actions:
        response.debug(f"Parsing action: {action}")

        # If this line is empty, then skip
        # NOTE(review): the two 'continue's below skip the n_lines increment at the bottom
        # of the loop, so reported "line" numbers don't advance past blank/comment lines — confirm intent
        match = re.match(r"\s*$", action)
        if match:
            continue

        # If this line begins with a #, it is a comment, then skip
        match = re.match(r"#", action)
        if match:
            continue

        #### First look for a naked command without parentheses
        match = re.match(r"\s*([A-Za-z_]+)\s*$", action)
        if match is not None:
            action = {"line": n_lines, "command": match.group(1), "parameters": None}
            actions.append(action)

        #### Then look for and parse a command with parentheses and a comma-separated parameter list
        if match is None:
            match = re.match(r"\s*([A-Za-z_]+)\((.*)\)\s*$", action)
            if match is not None:
                command = match.group(1)
                param_string = match.group(2)

                #### Split the parameters on comma and process those
                param_string_list = re.split(",", param_string)
                parameters = {}

                #### If a value is of the form key=[value1,value2] special code is needed to recompose that
                # Two-state machine: 'normal' handles key=value items; 'in_list' accumulates
                # items between an opening "[" and the closing "]" into list_buffer
                mode = 'normal'
                list_buffer = []
                key = ''
                for param_item in param_string_list:
                    param_item = param_item.strip()
                    if mode == 'normal':
                        #### Split on the first = only (might be = in the value)
                        values = re.split("=", param_item, 1)
                        key = values[0]

                        #### If there isn't a value after an =, then just set to string true
                        value = 'true'
                        if len(values) > 1:
                            value = values[1]
                        key = key.strip()
                        value = value.strip()

                        #### If the value begins with a "[", then this is a list
                        match = re.match(r"\[(.+)$", value)
                        if match:
                            #### If it also ends with a "]", then this is a list of one element
                            match2 = re.match(r"\[(.*)\]$", value)
                            if match2:
                                if match2.group(1) == '':
                                    parameters[key] = []
                                else:
                                    parameters[key] = [match2.group(1)]
                            else:
                                mode = 'in_list'
                                list_buffer = [match.group(1)]
                        else:
                            parameters[key] = value

                    #### Special processing if we're in the middle of a list
                    elif mode == 'in_list':
                        match = re.match(r"(.*)\]$", param_item)
                        if match:
                            mode = 'normal'
                            list_buffer.append(match.group(1))
                            parameters[key] = list_buffer
                        else:
                            list_buffer.append(param_item)
                    else:
                        # Unreachable: mode is only ever 'normal' or 'in_list'
                        eprint("Inconceivable!")
                # An unterminated "[" list runs to the end of the parameter string
                if mode == 'in_list':
                    parameters[key] = list_buffer

                #### Store the parsed result in a dict and add to the list
                action = {"line": n_lines, "command": command, "parameters": parameters}
                actions.append(action)
            else:
                # NOTE(review): error_code "ActionsListEmpty" is reused here for an
                # unparseable action line — a distinct code would be clearer
                response.error(f"Unable to parse action {action}", error_code="ActionsListEmpty")
        n_lines += 1

    #### Put the actions in the response data envelope and return
    response.data["actions"] = actions
    return response
def _convert_one_hop_query_graph_to_cypher_query(self, qg: QueryGraph, enforce_directionality: bool,
                                                 kg_name: str, log: ARAXResponse) -> str:
    """Build a cypher query answering a one-hop (single-edge) query graph.

    The query matches the edge pattern, filters by qnode ids/categories, and collects
    results into three columns: nodes_<subject qnode>, nodes_<object qnode>, and
    edges_<qedge>. Returns "" (and logs an error) if generation fails.
    """
    qedge_key = next(qedge_key for qedge_key in qg.edges)
    qedge = qg.edges[qedge_key]
    log.debug(f"Generating cypher for edge {qedge_key} query graph")
    try:
        # Build the match clause
        subject_qnode_key = qedge.subject
        object_qnode_key = qedge.object
        edge_cypher = self._get_cypher_for_query_edge(qedge_key, qg, enforce_directionality)
        subject_cypher = self._get_cypher_for_query_node(subject_qnode_key, qg, kg_name)
        object_cypher = self._get_cypher_for_query_node(object_qnode_key, qg, kg_name)
        match_clause = f"MATCH {subject_cypher}{edge_cypher}{object_cypher}"

        # Build the where clause
        where_fragments = []
        for qnode_key in [subject_qnode_key, object_qnode_key]:
            qnode = qg.nodes[qnode_key]
            if qnode.id and isinstance(qnode.id, list) and len(qnode.id) > 1:
                where_fragments.append(f"{qnode_key}.id in {qnode.id}")
            if qnode.category:
                if kg_name == "KG2c":
                    # Only inspect the 'all_categories' field if we're using KG2c
                    category_fragments = [f"'{category}' in {qnode_key}.types"
                                          for category in qnode.category]
                    joined_category_fragments = " OR ".join(category_fragments)
                    # Parenthesize only when OR-ing multiple fragments
                    if len(category_fragments) < 2:
                        where_fragments.append(joined_category_fragments)
                    else:
                        where_fragments.append(f"({joined_category_fragments})")
                elif len(qnode.category) > 1:
                    # Otherwise add a simple where condition if we have multiple categories
                    node_category_property = "category_label" if kg_name == "KG2" else "category"
                    where_fragments.append(f"{qnode_key}.{node_category_property} in {qnode.category}")
        where_clause = f"WHERE {' AND '.join(where_fragments)}" if where_fragments else ""

        # Build the with clause
        source_qnode_col_name = f"nodes_{subject_qnode_key}"
        target_qnode_col_name = f"nodes_{object_qnode_key}"
        qedge_col_name = f"edges_{qedge_key}"
        # This line grabs the edge's ID and a record of which of its nodes correspond to which qnode ID
        extra_edge_properties = "{.*, " + f"id:ID({qedge_key}), {subject_qnode_key}:{subject_qnode_key}.id, {object_qnode_key}:{object_qnode_key}.id" + "}"
        with_clause = f"WITH collect(distinct {subject_qnode_key}) as {source_qnode_col_name}, " \
                      f"collect(distinct {object_qnode_key}) as {target_qnode_col_name}, " \
                      f"collect(distinct {qedge_key}{extra_edge_properties}) as {qedge_col_name}"

        # Build the return clause
        return_clause = f"RETURN {source_qnode_col_name}, {target_qnode_col_name}, {qedge_col_name}"

        return f"{match_clause} {where_clause} {with_clause} {return_clause}"
    except Exception:
        tb = traceback.format_exc()
        error_type, error, _ = sys.exc_info()
        log.error(f"Problem generating cypher for query. {tb}", error_code=error_type.__name__)
        return ""
def _answer_query_using_CHP_client(
        self, query_graph: QueryGraph,
        log: ARAXResponse) -> QGOrganizedKnowledgeGraph:
    """Answer a one-hop gene--drug query graph using the CHP client.

    One end of the edge must resolve to CHP-allowable drug curies and the
    other to gene curies (either given explicitly as ids, or implied by a
    drug/chemical_substance or gene category). For each gene/drug pair the
    CHP client is queried for the outcome probability of EFO:0000714
    against self.CHP_survival_threshold (disease hard-coded to
    MONDO:0007254), and a 'paired_with' edge carrying that probability is
    added to the returned KG. On any validation problem an error/warning is
    logged on `log` and an (empty) QGOrganizedKnowledgeGraph is returned.
    """
    qedge_key = next(qedge_key for qedge_key in query_graph.edges)
    log.debug(
        f"Processing query results for edge {qedge_key} by using CHP client")
    final_kg = QGOrganizedKnowledgeGraph()
    # Category labels CHP supports, in normalized form (lower-case,
    # 'biolink:' prefix and underscores stripped)
    gene_label_list = ['gene']
    drug_label_list = ['drug', 'chemicalsubstance']
    # use for checking the requirement
    source_pass_nodes = None
    source_category = None
    target_pass_nodes = None
    target_category = None
    qedge = query_graph.edges[qedge_key]
    source_qnode_key = qedge.subject
    target_qnode_key = qedge.object
    source_qnode = query_graph.nodes[source_qnode_key]
    target_qnode = query_graph.nodes[target_qnode_key]

    # check if both ends of edge have no curie
    if (source_qnode.id is None) and (target_qnode.id is None):
        log.error(f"Both ends of edge {qedge_key} are None",
                  error_code="BadEdge")
        return final_kg

    # Validate the source end: either its curies pass the CHP allowability
    # check, or its category must be drug/chemical_substance or gene
    if source_qnode.id is not None:
        has_error, pass_nodes, not_pass_nodes = self._check_id(
            source_qnode.id, log)
        if has_error:
            return final_kg
        if pass_nodes:
            source_pass_nodes = pass_nodes
            if not_pass_nodes:
                if len(not_pass_nodes) == 1:
                    log.warning(
                        f"The curie id of {not_pass_nodes[0]} is not allowable based on CHP client")
                else:
                    log.warning(
                        f"The curie ids of these nodes {not_pass_nodes} are not allowable based on CHP client")
        else:
            if type(source_qnode.id) is str:
                log.error(
                    f"The curie id of {source_qnode.id} is not allowable based on CHP client",
                    error_code="NotAllowable")
            else:
                log.error(
                    f"The curie ids of {source_qnode.id} are not allowable based on CHP client",
                    error_code="NotAllowable")
            return final_kg
    else:
        category = source_qnode.category[0].replace('biolink:', '').replace(
            '_', '').lower()
        if (category in drug_label_list) or (category in gene_label_list):
            source_category = category
        else:
            log.error(
                f"The category of query node {source_qnode_key} is unsatisfiable. It has to be drug/chemical_substance or gene",
                error_code="CategoryError")
            return final_kg

    # Validate the target end the same way
    if target_qnode.id is not None:
        has_error, pass_nodes, not_pass_nodes = self._check_id(
            target_qnode.id, log)
        if has_error:
            return final_kg
        if pass_nodes:
            target_pass_nodes = pass_nodes
            if not_pass_nodes:
                if len(not_pass_nodes) == 1:
                    log.warning(
                        f"The curie id of {not_pass_nodes[0]} is not allowable based on CHP client")
                else:
                    log.warning(
                        f"The curie ids of these nodes {not_pass_nodes} are not allowable based on CHP client")
        else:
            # NOTE: error code aligned with the source-end branch above
            # (was "CategoryError", which mislabeled an allowability issue)
            if type(target_qnode.id) is str:
                log.error(
                    f"The curie id of {target_qnode.id} is not allowable based on CHP client",
                    error_code="NotAllowable")
            else:
                log.error(
                    f"The curie ids of {target_qnode.id} are not allowable based on CHP client",
                    error_code="NotAllowable")
            return final_kg
    else:
        category = target_qnode.category[0].replace('biolink:', '').replace(
            '_', '').lower()
        if (category in drug_label_list) or (category in gene_label_list):
            target_category = category
        else:
            log.error(
                f"The category of query node {target_qnode_key} is unsatisfiable. It has to be drug/chemical_substance or gene",
                error_code="CategoryError")
            return final_kg

    if (source_pass_nodes is None) and (target_pass_nodes is None):
        return final_kg
    elif (source_pass_nodes is not None) and (target_pass_nodes is not None):
        # Both ends pinned to curies: query each gene/drug pair one by one
        source_dict = dict()
        target_dict = dict()
        # Which end holds the drugs is inferred from the first curie
        if source_pass_nodes[0] in self.allowable_drug_curies:
            source_category_temp = 'drug'
        else:
            source_category_temp = 'gene'
        if target_pass_nodes[0] in self.allowable_drug_curies:
            target_category_temp = 'drug'
        else:
            target_category_temp = 'gene'
        if source_category_temp == target_category_temp:
            log.error(
                f"The query nodes in both ends of edge are the same type which is {source_category_temp}",
                error_code="CategoryError")
            return final_kg
        for (source_curie, target_curie) in itertools.product(
                source_pass_nodes, target_pass_nodes):
            if source_category_temp == 'drug':
                # CHP expects 'CHEMBL:' rather than 'CHEMBL.COMPOUND:'
                source_curie_temp = source_curie.replace(
                    'CHEMBL.COMPOUND:', 'CHEMBL:')
                # Let's build a simple single query
                q = build_query(genes=[target_curie],
                                therapeutic=source_curie_temp,
                                disease='MONDO:0007254',
                                outcome=('EFO:0000714', '>=',
                                         self.CHP_survival_threshold))
                response = self.client.query(q)
                max_probability = self.client.get_outcome_prob(response)
                swagger_edge_key, swagger_edge = self._convert_to_swagger_edge(
                    target_curie, source_curie, "paired_with", max_probability)
            else:
                target_curie_temp = target_curie.replace(
                    'CHEMBL.COMPOUND:', 'CHEMBL:')
                # Let's build a simple single query
                q = build_query(genes=[source_curie],
                                therapeutic=target_curie_temp,
                                disease='MONDO:0007254',
                                outcome=('EFO:0000714', '>=',
                                         self.CHP_survival_threshold))
                response = self.client.query(q)
                max_probability = self.client.get_outcome_prob(response)
                swagger_edge_key, swagger_edge = self._convert_to_swagger_edge(
                    source_curie, target_curie, "paired_with", max_probability)
            source_dict[source_curie] = source_qnode_key
            target_dict[target_curie] = target_qnode_key
            # Finally add the current edge to our answer knowledge graph
            final_kg.add_edge(swagger_edge_key, swagger_edge, qedge_key)
        # Add the nodes to our answer knowledge graph
        for source_curie in source_dict:
            swagger_node_key, swagger_node = self._convert_to_swagger_node(
                source_curie)
            final_kg.add_node(swagger_node_key, swagger_node,
                              source_dict[source_curie])
        for target_curie in target_dict:
            swagger_node_key, swagger_node = self._convert_to_swagger_node(
                target_curie)
            final_kg.add_node(swagger_node_key, swagger_node,
                              target_dict[target_curie])
        return final_kg
    elif source_pass_nodes is not None:
        # Only the source end is pinned: enumerate CHP-allowable partners
        # matching the target's category and batch-query them
        source_dict = dict()
        target_dict = dict()
        if source_pass_nodes[0] in self.allowable_drug_curies:
            source_category_temp = 'drug'
        else:
            source_category_temp = 'gene'
        if target_category in drug_label_list:
            target_category_temp = 'drug'
        else:
            target_category_temp = 'gene'
        if source_category_temp == target_category_temp:
            log.error(
                f"The query nodes in both ends of edge are the same type which is {source_category_temp}",
                error_code="CategoryError")
            return final_kg
        if source_category_temp == 'drug':
            for source_curie in source_pass_nodes:
                # Allowable genes whose canonical categories include the
                # target's category
                genes = [
                    curie for curie in self.allowable_gene_curies
                    if self.synonymizer.get_canonical_curies(curie)[curie]
                    is not None and target_category in [
                        category.replace('biolink:', '').replace(
                            '_', '').lower()
                        for category in list(
                            self.synonymizer.get_canonical_curies(
                                curie, return_all_categories=True)[curie]
                            ['all_categories'].keys())
                    ]
                ]
                therapeutic = source_curie.replace('CHEMBL.COMPOUND:',
                                                   'CHEMBL:')
                disease = 'MONDO:0007254'
                outcome = ('EFO:0000714', '>=', self.CHP_survival_threshold)
                queries = [
                    build_query(genes=[gene], therapeutic=therapeutic,
                                disease=disease, outcome=outcome)
                    for gene in genes
                ]
                # use the query_all endpoint to run the batch of queries
                res = self.client.query_all(queries)
                for result, gene in zip(res["message"], genes):
                    prob = self.client.get_outcome_prob(result)
                    swagger_edge_key, swagger_edge = self._convert_to_swagger_edge(
                        gene, source_curie, "paired_with", prob)
                    source_dict[source_curie] = source_qnode_key
                    target_dict[gene] = target_qnode_key
                    # Finally add the current edge to our answer knowledge graph
                    final_kg.add_edge(swagger_edge_key, swagger_edge,
                                      qedge_key)
        else:
            for source_curie in source_pass_nodes:
                genes = [source_curie]
                # Allowable drugs whose canonical categories include the
                # target's category (CHP curies use 'CHEMBL:'; the
                # synonymizer needs 'CHEMBL.COMPOUND:')
                therapeutic = [
                    curie.replace('CHEMBL.COMPOUND:', 'CHEMBL:')
                    for curie in self.allowable_drug_curies
                    if self.synonymizer.get_canonical_curies(
                        curie.replace('CHEMBL:', 'CHEMBL.COMPOUND:'))
                    [curie.replace('CHEMBL:', 'CHEMBL.COMPOUND:')]
                    is not None and target_category in [
                        category.replace('biolink:', '').replace(
                            '_', '').lower()
                        for category in list(
                            self.synonymizer.get_canonical_curies(
                                curie.replace('CHEMBL:', 'CHEMBL.COMPOUND:'),
                                return_all_categories=True)[curie.replace(
                                    'CHEMBL:', 'CHEMBL.COMPOUND:')]
                            ['all_categories'].keys())
                    ]
                ]
                disease = 'MONDO:0007254'
                outcome = ('EFO:0000714', '>=', self.CHP_survival_threshold)
                queries = [
                    build_query(genes=genes, therapeutic=drug,
                                disease=disease, outcome=outcome)
                    for drug in therapeutic
                ]
                # use the query_all endpoint to run the batch of queries
                res = self.client.query_all(queries)
                for result, drug in zip(res["message"], therapeutic):
                    drug = drug.replace('CHEMBL:', 'CHEMBL.COMPOUND:')
                    prob = self.client.get_outcome_prob(result)
                    swagger_edge_key, swagger_edge = self._convert_to_swagger_edge(
                        source_curie, drug, "paired_with", prob)
                    source_dict[source_curie] = source_qnode_key
                    target_dict[drug] = target_qnode_key
                    # Finally add the current edge to our answer knowledge graph
                    final_kg.add_edge(swagger_edge_key, swagger_edge,
                                      qedge_key)
        # Add the nodes to our answer knowledge graph
        for source_curie in source_dict:
            swagger_node_key, swagger_node = self._convert_to_swagger_node(
                source_curie)
            final_kg.add_node(swagger_node_key, swagger_node,
                              source_dict[source_curie])
        for target_curie in target_dict:
            swagger_node_key, swagger_node = self._convert_to_swagger_node(
                target_curie)
            final_kg.add_node(swagger_node_key, swagger_node,
                              target_dict[target_curie])
        return final_kg
    else:
        # Only the target end is pinned: mirror image of the branch above
        source_dict = dict()
        target_dict = dict()
        if target_pass_nodes[0] in self.allowable_drug_curies:
            target_category_temp = 'drug'
        else:
            target_category_temp = 'gene'
        if source_category in drug_label_list:
            source_category_temp = 'drug'
        else:
            source_category_temp = 'gene'
        if source_category_temp == target_category_temp:
            log.error(
                f"The query nodes in both ends of edge are the same type which is {source_category_temp}",
                error_code="CategoryError")
            return final_kg
        if target_category_temp == 'drug':
            for target_curie in target_pass_nodes:
                # Allowable genes whose canonical categories include the
                # source's category
                genes = [
                    curie for curie in self.allowable_gene_curies
                    if self.synonymizer.get_canonical_curies(curie)[curie]
                    is not None and source_category in [
                        category.replace('biolink:', '').replace(
                            '_', '').lower()
                        for category in list(
                            self.synonymizer.get_canonical_curies(
                                curie, return_all_categories=True)[curie]
                            ['all_categories'].keys())
                    ]
                ]
                therapeutic = target_curie.replace('CHEMBL.COMPOUND:',
                                                   'CHEMBL:')
                disease = 'MONDO:0007254'
                outcome = ('EFO:0000714', '>=', self.CHP_survival_threshold)
                queries = [
                    build_query(genes=[gene], therapeutic=therapeutic,
                                disease=disease, outcome=outcome)
                    for gene in genes
                ]
                # use the query_all endpoint to run the batch of queries
                res = self.client.query_all(queries)
                for result, gene in zip(res["message"], genes):
                    prob = self.client.get_outcome_prob(result)
                    swagger_edge_key, swagger_edge = self._convert_to_swagger_edge(
                        gene, target_curie, "paired_with", prob)
                    source_dict[gene] = source_qnode_key
                    target_dict[target_curie] = target_qnode_key
                    # Finally add the current edge to our answer knowledge graph
                    final_kg.add_edge(swagger_edge_key, swagger_edge,
                                      qedge_key)
        else:
            for target_curie in target_pass_nodes:
                genes = [target_curie]
                # Allowable drugs whose canonical categories include the
                # source's category
                therapeutic = [
                    curie.replace('CHEMBL.COMPOUND:', 'CHEMBL:')
                    for curie in self.allowable_drug_curies
                    if self.synonymizer.get_canonical_curies(
                        curie.replace('CHEMBL:', 'CHEMBL.COMPOUND:'))
                    [curie.replace('CHEMBL:', 'CHEMBL.COMPOUND:')]
                    is not None and source_category in [
                        category.replace('biolink:', '').replace(
                            '_', '').lower()
                        for category in list(
                            self.synonymizer.get_canonical_curies(
                                curie.replace('CHEMBL:', 'CHEMBL.COMPOUND:'),
                                return_all_categories=True)[curie.replace(
                                    'CHEMBL:', 'CHEMBL.COMPOUND:')]
                            ['all_categories'].keys())
                    ]
                ]
                disease = 'MONDO:0007254'
                outcome = ('EFO:0000714', '>=', self.CHP_survival_threshold)
                queries = [
                    build_query(genes=genes, therapeutic=drug,
                                disease=disease, outcome=outcome)
                    for drug in therapeutic
                ]
                # use the query_all endpoint to run the batch of queries
                res = self.client.query_all(queries)
                for result, drug in zip(res["message"], therapeutic):
                    drug = drug.replace('CHEMBL:', 'CHEMBL.COMPOUND:')
                    prob = self.client.get_outcome_prob(result)
                    swagger_edge_key, swagger_edge = self._convert_to_swagger_edge(
                        target_curie, drug, "paired_with", prob)
                    source_dict[drug] = source_qnode_key
                    target_dict[target_curie] = target_qnode_key
                    # Finally add the current edge to our answer knowledge graph
                    final_kg.add_edge(swagger_edge_key, swagger_edge,
                                      qedge_key)
        # Add the nodes to our answer knowledge graph
        for source_curie in source_dict:
            swagger_node_key, swagger_node = self._convert_to_swagger_node(
                source_curie)
            final_kg.add_node(swagger_node_key, swagger_node,
                              source_dict[source_curie])
        for target_curie in target_dict:
            swagger_node_key, swagger_node = self._convert_to_swagger_node(
                target_curie)
            final_kg.add_node(swagger_node_key, swagger_node,
                              target_dict[target_curie])
        return final_kg
def create_results(
    qg: QueryGraph,
    kg: QGOrganizedKnowledgeGraph,
    log: ARAXResponse,
    overlay_fet: bool = False,
    rank_results: bool = False,
    qnode_key_to_prune: Optional[str] = None,
) -> Response:
    """Build results for the given KG/QG in a throwaway response.

    Optionally overlays Fisher-Exact-Test (FET) virtual edges for every
    qedge touching `qnode_key_to_prune` before resultification, and
    optionally ranks the produced results. Returns the internal
    ARAXResponse used for this pruning work (warnings/errors are reported
    on `log`).
    """
    pruning_response = ARAXResponse()
    pruning_response.envelope = Response()
    pruning_response.envelope.message = Message()
    message = pruning_response.envelope.message
    message.query_graph = qg
    message.knowledge_graph = convert_qg_organized_kg_to_standard_kg(kg)
    if overlay_fet:
        log.debug(
            f"Using FET to assess quality of intermediate answers in Expand")
        # One FET overlay per (neighbor qnode, pruned qnode) pair
        touching_qedges = [
            qedge for qedge in qg.edges.values()
            if qnode_key_to_prune in (qedge.subject, qedge.object)
        ]
        qnode_pairs = {
            (qedge.object if qedge.subject == qnode_key_to_prune
             else qedge.subject, qnode_key_to_prune)
            for qedge in touching_qedges
        }
        for other_qnode_key, pruned_qnode_key in qnode_pairs:
            pair_string_id = f"{other_qnode_key}-->{pruned_qnode_key}"
            log.debug(f"Overlaying FET for {pair_string_id} (from Expand)")
            fet_qedge_key = f"FET{pair_string_id}"
            try:
                ARAXOverlay().apply(
                    pruning_response, {
                        "action": "fisher_exact_test",
                        "subject_qnode_key": other_qnode_key,
                        "object_qnode_key": pruned_qnode_key,
                        "virtual_relation_label": fet_qedge_key
                    })
            except Exception as error:
                exception_type, exception_value, exception_traceback = sys.exc_info()
                log.warning(
                    f"An uncaught error occurred when overlaying with FET during Expand's pruning: {error}: "
                    f"{repr(traceback.format_exception(exception_type, exception_value, exception_traceback))}"
                )
            if pruning_response.status == "OK":
                if fet_qedge_key in qg.edges:
                    qg.edges[fet_qedge_key].option_group_id = (
                        f"FET_VIRTUAL_GROUP_{pair_string_id}")
                else:
                    log.warning(
                        f"Attempted to overlay FET from Expand, but it didn't work. Pruning without it."
                    )
            else:
                log.warning(
                    f"FET produced an error when Expand tried to use it to prune the KG. "
                    f"Log was: {pruning_response.show()}")
                log.debug(f"Will continue pruning without overlaying FET")
                # Get rid of any FET edges that might be in the KG/QG, since this step failed
                remove_edges_with_qedge_key(
                    pruning_response.envelope.message.knowledge_graph,
                    fet_qedge_key)
                qg.edges.pop(fet_qedge_key, None)
                # Clear the status so we can continue without overlaying
                pruning_response.status = "OK"
    # Create results and rank them as appropriate
    log.debug(f"Calling Resultify from Expand for pruning")
    ARAXResultify().apply(pruning_response, {})
    if rank_results:
        try:
            log.debug(f"Ranking Expand's intermediate pruning results")
            ARAXRanker().aggregate_scores_dmk(pruning_response)
        except Exception as error:
            exception_type, exception_value, exception_traceback = sys.exc_info()
            log.error(
                f"An uncaught error occurred when attempting to rank results during Expand's pruning: "
                f"{error}: {repr(traceback.format_exception(exception_type, exception_value, exception_traceback))}."
                f"Log was: {pruning_response.show()}",
                error_code="UncaughtARAXiError")
            # Give any unranked results a score of 0
            for result in pruning_response.envelope.message.results:
                if result.score is None:
                    result.score = 0
    return pruning_response
def add_query_graph_tags(self, message, query_graph_info):
    """Tag each KnowledgeGraph node/edge with the qnode_id/qedge_id it fills.

    Nodes are mapped via query_graph_info.node_category_map — exactly one
    of a node's categories must appear there, otherwise an error is
    recorded. Edges are mapped via query_graph_info.edge_predicate_map.
    Already-tagged nodes/edges are left untouched. Returns an ARAXResponse
    carrying any error encountered.
    """
    #### Define a default response
    response = ARAXResponse()
    self.response = response
    self.message = message
    response.debug(f"Adding temporary QueryGraph ids to KnowledgeGraph")

    #### Get shorter handles
    knowledge_graph = message.knowledge_graph
    nodes = knowledge_graph.nodes
    edges = knowledge_graph.edges

    #### Loop through nodes adding qnode_ids
    for key, node in nodes.items():
        #### If there is no qnode_id, determine what it should be and add it
        if node.qnode_id is None:
            categories = node.category
            #### A node without any category cannot be matched at all
            if categories is None:
                response.error(
                    f"KnowledgeGraph node {key} does not have a category. This should never be",
                    error_code="NodeMissingCategory")
                return response
            #### Find a matching category in the QueryGraph for this node
            n_found_categories = 0
            found_category = None
            for node_category in categories:
                if node_category in query_graph_info.node_category_map:
                    n_found_categories += 1
                    found_category = node_category
            #### If we did not find exactly one matching category, error out
            if n_found_categories == 0:
                response.error(
                    f"Tried to find categorys '{categories}' for KnowledgeGraph node {key} in query_graph_info, but did not find it",
                    error_code="NodeCategoryMissingInQueryGraph")
                return response
            elif n_found_categories > 1:
                response.error(
                    f"Tried to find categorys '{categories}' for KnowledgeGraph node {key} in query_graph_info, and found multiple matches. This is ambiguous",
                    error_code="MultipleNodeCategorysInQueryGraph")
                return response
            #### Else add it
            node.qnode_id = query_graph_info.node_category_map[found_category]

    #### Loop through the edges adding qedge_ids
    for key, edge in edges.items():
        #### Check to see if there is already a qedge_id attribute on the edge
        if edge.qedge_id is None:
            #### If there isn't a predicate or can't find it in the query_graph, error out
            if edge.predicate is None:
                response.error(
                    f"KnowledgeGraph edge {key} does not have a predicate. This should never be",
                    error_code="EdgeMissingPredicate")
                return response
            if edge.predicate not in query_graph_info.edge_predicate_map:
                # Fixed message: this concerns an edge, not a node
                response.error(
                    f"Tried to find predicate '{edge.predicate}' for KnowledgeGraph edge {key} in query_graph_info, but did not find it",
                    error_code="EdgePredicateMissingInQueryGraph")
                return response
            #### Else add it
            edge.qedge_id = query_graph_info.edge_predicate_map[edge.predicate]

    #### Return the response
    return response
def check_for_query_graph_tags(self, message, query_graph_info):
    """Survey the KnowledgeGraph for qnode_id/qedge_id tags.

    Records node/edge counts on self, rebuilds self.node_map['by_qnode_id']
    and self.edge_map['by_qedge_id'] (qnode_id/qedge_id -> {kg key: 1}),
    and sets self.query_graph_id_node_status / self.query_graph_id_edge_status
    to an 'all/no/only some ... have query_graph_ids' summary string.
    Returns the ARAXResponse used for logging.
    """
    #### Define a default response
    response = ARAXResponse()
    self.response = response
    self.message = message
    response.debug(f"Checking KnowledgeGraph for QueryGraph tags")

    #### Get shorter handles
    kg = message.knowledge_graph
    nodes = kg.nodes
    edges = kg.edges

    #### Store number of nodes and edges
    self.n_nodes = len(nodes)
    self.n_edges = len(edges)
    response.debug(f"Found {self.n_nodes} nodes and {self.n_edges} edges")

    #### Clear the maps
    self.node_map = {'by_qnode_id': {}}
    self.edge_map = {'by_qedge_id': {}}

    #### Tally nodes carrying a qnode_id, grouping them by that id
    tagged_node_count = 0
    nodes_by_qnode_id = self.node_map['by_qnode_id']
    for key, node in nodes.items():
        if node.qnode_id is None:
            continue
        tagged_node_count += 1
        nodes_by_qnode_id.setdefault(node.qnode_id, {})[key] = 1

    #### Summarize node coverage
    if tagged_node_count == self.n_nodes:
        self.query_graph_id_node_status = 'all nodes have query_graph_ids'
    elif tagged_node_count == 0:
        self.query_graph_id_node_status = 'no nodes have query_graph_ids'
    else:
        self.query_graph_id_node_status = 'only some nodes have query_graph_ids'
    response.info(
        f"In the KnowledgeGraph, {self.query_graph_id_node_status}")

    #### Same tally for edges, grouped by qedge_id
    tagged_edge_count = 0
    edges_by_qedge_id = self.edge_map['by_qedge_id']
    for key, edge in edges.items():
        if edge.qedge_id is None:
            continue
        tagged_edge_count += 1
        edges_by_qedge_id.setdefault(edge.qedge_id, {})[key] = 1

    #### Summarize edge coverage
    if tagged_edge_count == self.n_edges:
        self.query_graph_id_edge_status = 'all edges have query_graph_ids'
    elif tagged_edge_count == 0:
        self.query_graph_id_edge_status = 'no edges have query_graph_ids'
    else:
        self.query_graph_id_edge_status = 'only some edges have query_graph_ids'
    response.info(
        f"In the KnowledgeGraph, {self.query_graph_id_edge_status}")

    #### Return the response
    return response