示例#1
0
    def _add_answers_to_kg(self, answer_kg: QGOrganizedKnowledgeGraph, reasoner_std_response: Dict[str, any],
                           input_qnode_key: str, output_qnode_key: str, qedge_key: str, log: ARAXResponse) -> QGOrganizedKnowledgeGraph:
        """
        Copy the nodes and edges of a BTE response into answer_kg, filed under our own query graph keys.

        BTE's query graph IDs ("n0", "n1", "e1") are translated back to input_qnode_key, output_qnode_key
        and qedge_key respectively. Output nodes are re-keyed to the curie chosen by
        _get_best_equivalent_bte_curie(), and edges are re-pointed at those new keys.

        :param answer_kg: The knowledge graph to add answers to (modified in place).
        :param reasoner_std_response: BTE's response; its 'results' and 'knowledge_graph' entries are read.
        :param input_qnode_key: Our qnode key corresponding to BTE's "n0".
        :param output_qnode_key: Our qnode key corresponding to BTE's "n1".
        :param qedge_key: Our qedge key corresponding to BTE's "e1".
        :param log: Response object used for debug/error logging.
        :return: answer_kg. If an unrecognized BTE qg_id is hit, an error is logged and answer_kg is
                 returned immediately with whatever had been added up to that point.
        """
        qg_ids_by_kg_id = self._build_kg_to_qg_id_dict(reasoner_std_response['results'])
        bte_kg = reasoner_std_response['knowledge_graph']
        if not bte_kg['edges']:
            # Nothing to load when BTE returned no edges
            return answer_kg

        log.debug(f"Got results back from BTE for this query ({len(bte_kg['edges'])} edges)")
        preferred_keys_by_bte_key = dict()

        for bte_node in bte_kg['nodes']:
            arax_node = Node()
            bte_node_key = bte_node.get('id')
            arax_node.name = bte_node.get('name')
            arax_node.category = eu.convert_to_list(eu.convert_string_to_snake_case(bte_node.get('type')))

            # Translate BTE's qg_id for this node into the qnode_key used in our query graph
            node_qg_id = qg_ids_by_kg_id['nodes'].get(bte_node_key)
            if node_qg_id not in ("n0", "n1"):
                log.error("Could not map BTE qg_id to ARAX qnode_key", error_code="UnknownQGID")
                return answer_kg
            qnode_key = input_qnode_key if node_qg_id == "n0" else output_qnode_key

            # Only output nodes get swapped to their preferred equivalent identifier
            if qnode_key != output_qnode_key:
                arax_node_key = bte_node_key
            elif bte_node_key in preferred_keys_by_bte_key:
                arax_node_key = preferred_keys_by_bte_key[bte_node_key]
            else:
                equivalent_curies = []
                for prefix, local_ids in bte_node.get('equivalent_identifiers').items():
                    for local_id in local_ids:
                        equivalent_curies.append(f"{prefix}:{eu.get_curie_local_id(local_id)}")
                arax_node_key = self._get_best_equivalent_bte_curie(equivalent_curies, arax_node.category[0])
                preferred_keys_by_bte_key[bte_node_key] = arax_node_key

            answer_kg.add_node(arax_node_key, arax_node, qnode_key)

        for bte_edge in bte_kg['edges']:
            arax_edge = Edge()
            arax_edge_key = bte_edge.get("id")
            arax_edge.predicate = bte_edge.get('type')
            # Point the edge at the re-keyed node where one exists, otherwise at BTE's original key
            source_id = bte_edge.get('source_id')
            arax_edge.subject = preferred_keys_by_bte_key.get(source_id, source_id)
            target_id = bte_edge.get('target_id')
            arax_edge.object = preferred_keys_by_bte_key.get(target_id, target_id)
            provided_by = Attribute(name="provided_by", value=bte_edge.get('edge_source'),
                                    type=eu.get_attribute_type("provided_by"))
            is_defined_by = Attribute(name="is_defined_by", value="BTE",
                                      type=eu.get_attribute_type("is_defined_by"))
            arax_edge.attributes = [provided_by, is_defined_by]

            # Translate BTE's qg_id for this edge into the qedge_key used in our query graph
            if qg_ids_by_kg_id['edges'].get(arax_edge_key) != "e1":
                log.error("Could not map BTE qg_id to ARAX qedge_key", error_code="UnknownQGID")
                return answer_kg
            answer_kg.add_edge(arax_edge_key, arax_edge, qedge_key)

        return answer_kg
示例#2
0
    def answer_single_node_query(
            self, single_node_qg: QueryGraph) -> QGOrganizedKnowledgeGraph:
        """
        Answer a query graph consisting of a single qnode by looking its node(s) up in neo4j.

        The qnode's curie(s) are first expanded to synonyms (KG1 with use_synonyms on) or mapped to
        canonical curies (KG2c); in both of those cases the qnode's category is cleared.

        :param single_node_qg: A query graph whose first (expected only) node is the one to look up.
        :return: A knowledge graph (organized by QG IDs) holding every node returned by neo4j.
        """
        kg_name = self.kg_name
        use_synonyms = self.use_synonyms
        log = self.response
        final_kg = QGOrganizedKnowledgeGraph()
        single_node_qg = eu.make_qg_use_old_types(
            single_node_qg)  # Temporary patch until we switch to KG2.5.1
        qnode_key = next(iter(single_node_qg.nodes))
        qnode = single_node_qg.nodes[qnode_key]

        # Convert qnode curies as needed (either to synonyms or to canonical versions)
        if qnode.id:
            if use_synonyms and kg_name == "KG1":
                qnode.id = eu.get_curie_synonyms(qnode.id, log)
                qnode.category = [
                ]  # Important to clear this, otherwise results are limited (#889)
            elif kg_name == "KG2c":
                qnode.id = eu.get_canonical_curies_list(qnode.id, log)
                qnode.category = [
                ]  # Important to clear this to avoid discrepancies in types for particular concepts

        # Build and run a cypher query to get this node/nodes
        if isinstance(qnode.id, str):
            # Escape backslashes and single quotes so the curie cannot terminate the Cypher string
            # literal early (which would either break the query or let the curie inject Cypher)
            escaped_curie = qnode.id.replace("\\", "\\\\").replace("'", "\\'")
            where_clause = f"{qnode_key}.id='{escaped_curie}'"
        else:
            # Relies on Python's list repr, which quotes and escapes each string element itself
            where_clause = f"{qnode_key}.id in {qnode.id}"
        cypher_query = f"MATCH {self._get_cypher_for_query_node(qnode_key, single_node_qg, kg_name)} WHERE {where_clause} RETURN {qnode_key}"
        log.info(
            f"Sending cypher query for node {qnode_key} to {kg_name} neo4j")
        results = self._run_cypher_query(cypher_query, kg_name, log)

        # Load the results into swagger object model and add to our answer knowledge graph
        for result in results:
            neo4j_node = result.get(qnode_key)
            swagger_node_key, swagger_node = self._convert_neo4j_node_to_swagger_node(
                neo4j_node, kg_name)
            final_kg.add_node(swagger_node_key, swagger_node, qnode_key)

        # TODO: remove this patch once we switch to KG2.5.0!
        eu.convert_node_and_edge_types_to_new_format(final_kg)

        return final_kg
示例#3
0
    def _load_answers_into_kg(self, neo4j_results: List[Dict[str, List[Dict[str, any]]]],
                              kg_name: str, qg: QueryGraph,
                              log: ARAXResponse) -> QGOrganizedKnowledgeGraph:
        """
        Convert the first row of neo4j results into a knowledge graph organized by QG IDs.

        Columns whose name starts with 'nodes' (e.g. 'nodes_n00') are loaded as nodes under the qnode
        key that follows the 'nodes_' prefix; columns starting with 'edges' (e.g. 'edges_e01') are
        loaded as edges under the qedge key that follows 'edges_'. Other columns are ignored.

        :param neo4j_results: Results from neo4j; only the first element is read.
        :param kg_name: Name of the knowledge graph queried; a node UUID-to-curie lookup is built
                        only when this is "KG1".
        :param qg: The query graph that was answered (used for the debug log message).
        :param log: Response object used for logging.
        :return: The populated knowledge graph.
        """
        first_qedge_key = next(iter(qg.edges))
        log.debug(f"Processing query results for edge {first_qedge_key}")
        answer_kg = QGOrganizedKnowledgeGraph()
        results_table = neo4j_results[0]
        if kg_name == "KG1":
            uuid_to_curie = self._build_node_uuid_to_curie_dict(results_table)
        else:
            uuid_to_curie = dict()

        # Work from a snapshot of the column names, then pull each column's rows by name
        for column_name in list(results_table):
            if column_name.startswith('nodes'):
                qnode_key = column_name.replace("nodes_", "", 1)
                for neo4j_node in results_table.get(column_name):
                    node_key, node = self._convert_neo4j_node_to_trapi_node(neo4j_node, kg_name)
                    answer_kg.add_node(node_key, node, qnode_key)
            elif column_name.startswith('edges'):
                qedge_key = column_name.replace("edges_", "", 1)
                for neo4j_edge in results_table.get(column_name):
                    edge_key, edge = self._convert_neo4j_edge_to_trapi_edge(neo4j_edge, uuid_to_curie, kg_name)
                    answer_kg.add_edge(edge_key, edge, qedge_key)

        return answer_kg
示例#4
0
    def _answer_single_node_query_using_neo4j(self, qnode_key: str,
                                              qg: QueryGraph, kg_name: str,
                                              log: ARAXResponse) -> QGOrganizedKnowledgeGraph:
        """
        Look up the node(s) matching a single qnode's curie(s) in neo4j.

        :param qnode_key: Key of the qnode (within qg) to look up.
        :param qg: The query graph containing that qnode.
        :param kg_name: Name of the knowledge graph whose neo4j instance is queried.
        :param log: Response object used for logging.
        :return: A knowledge graph (organized by QG IDs) holding every node returned by neo4j.
        """
        qnode = qg.nodes[qnode_key]
        answer_kg = QGOrganizedKnowledgeGraph()

        # Build and run a cypher query to get this node/nodes
        if isinstance(qnode.id, str):
            # Escape backslashes and single quotes so the curie cannot terminate the Cypher string
            # literal early (which would either break the query or let the curie inject Cypher)
            escaped_curie = qnode.id.replace("\\", "\\\\").replace("'", "\\'")
            where_clause = f"{qnode_key}.id='{escaped_curie}'"
        else:
            # Relies on Python's list repr, which quotes and escapes each string element itself
            where_clause = f"{qnode_key}.id in {qnode.id}"
        cypher_query = f"MATCH {self._get_cypher_for_query_node(qnode_key, qg)} WHERE {where_clause} RETURN {qnode_key}"
        log.info(
            f"Sending cypher query for node {qnode_key} to {kg_name} neo4j")
        results = self._run_cypher_query(cypher_query, kg_name, log)

        # Load the results into API object model and add to our answer knowledge graph
        for result in results:
            neo4j_node = result.get(qnode_key)
            node_key, node = self._convert_neo4j_node_to_trapi_node(
                neo4j_node, kg_name)
            answer_kg.add_node(node_key, node, qnode_key)

        return answer_kg
示例#5
0
    def answer_one_hop_query(self, query_graph: QueryGraph) -> Tuple[QGOrganizedKnowledgeGraph, Dict[str, Dict[str, str]]]:
        """
        Answer a single-edge query graph using BTE.

        :param query_graph: A Reasoner API standard query graph.
        :return: A tuple of:
            1. a knowledge graph (organized by QG IDs) holding the nodes and edges found for the query;
            2. a map from each answer edge to the nodes fulfilling each qnode_key, e.g.
               {'KG1:111221': {'n00': 'DOID:111', 'n01': 'HP:124'}, 'KG1:111223': {'n00': 'DOID:111', 'n01': 'HP:126'}}
            The map is left empty whenever an error status is hit or the query graph is not fulfilled.
        """
        log = self.response
        parameters = log.data['parameters']
        enforce_directionality = parameters.get('enforce_directionality')
        use_synonyms = parameters.get('use_synonyms')
        answer_kg = QGOrganizedKnowledgeGraph()
        edge_to_nodes_map = dict()
        valid_bte_inputs_dict = self._get_valid_bte_inputs_dict()
        query_graph = eu.make_qg_use_old_types(query_graph)  # Temporary patch until KP is TRAPI 1.0 compliant

        # Check the input is something BTE can handle before sending anything
        input_qnode_key, output_qnode_key = self._validate_and_pre_process_input(
            qg=query_graph,
            valid_bte_inputs_dict=valid_bte_inputs_dict,
            enforce_directionality=enforce_directionality,
            use_synonyms=use_synonyms,
            log=log,
        )
        if log.status != 'OK':
            return answer_kg, edge_to_nodes_map
        input_qnode = query_graph.nodes[input_qnode_key]
        output_qnode = query_graph.nodes[output_qnode_key]

        # Ask BTE for answers
        answer_kg, accepted_curies = self._answer_query_using_bte(
            input_qnode_key=input_qnode_key,
            output_qnode_key=output_qnode_key,
            qg=query_graph,
            answer_kg=answer_kg,
            valid_bte_inputs_dict=valid_bte_inputs_dict,
            log=log,
        )
        if log.status != 'OK':
            return answer_kg, edge_to_nodes_map

        # When both qnodes carry curies, prune the answers down to emulate a curie-to-curie query
        is_curie_to_curie = eu.qg_is_fulfilled(query_graph, answer_kg) and input_qnode.id and output_qnode.id
        if is_curie_to_curie:
            answer_kg = self._prune_answers_to_achieve_curie_to_curie_query(answer_kg, output_qnode_key, query_graph)

        # Fulfillment is re-checked here because pruning may have changed the answer
        if not eu.qg_is_fulfilled(query_graph, answer_kg):
            if not accepted_curies:
                log.warning(f"BTE could not accept any of the input curies. Valid curie prefixes for BTE are: "
                            f"{valid_bte_inputs_dict['curie_prefixes']}")
            return answer_kg, edge_to_nodes_map

        answer_kg = eu.switch_kg_to_arax_curie_format(answer_kg)
        edge_to_nodes_map = self._create_edge_to_nodes_map(answer_kg, input_qnode_key, output_qnode_key)
        return answer_kg, edge_to_nodes_map
示例#6
0
    def _answer_query_using_neo4j(
            self, qg: QueryGraph, kg_name: str, qedge_key: str,
            enforce_directionality: bool,
            log: ARAXResponse) -> QGOrganizedKnowledgeGraph:
        """
        Answer a one-hop query graph via neo4j: build the cypher, run it, load the results.

        :param qg: The one-hop query graph to answer.
        :param kg_name: Name of the knowledge graph whose neo4j instance is queried.
        :param qedge_key: Key of the query graph's edge.
        :param enforce_directionality: Passed through to the cypher-building step.
        :param log: Response object; its status is checked after the first two steps.
        :return: The loaded knowledge graph, or an empty one if building or sending the query left
                 log.status at anything other than 'OK'.
        """
        empty_kg = QGOrganizedKnowledgeGraph()

        cypher_query = self._convert_one_hop_query_graph_to_cypher_query(qg, enforce_directionality, log)
        if log.status != 'OK':
            return empty_kg

        neo4j_results = self._send_query_to_neo4j(cypher_query, qedge_key, kg_name, log)
        if log.status != 'OK':
            return empty_kg

        # Whatever the load step produced is returned, regardless of the resulting status
        return self._load_answers_into_kg(neo4j_results, kg_name, qg, log)
示例#7
0
    def answer_single_node_query(self, single_node_qg: QueryGraph) -> QGOrganizedKnowledgeGraph:
        """
        Answer a query graph consisting of a single qnode using Plover.

        :param single_node_qg: A query graph whose first (expected only) node is the one to look up.
        :return: The knowledge graph built from Plover's answer, or an empty knowledge graph (with an
                 error logged) if Plover's response status was not 200.
        """
        log = self.response
        qnode_key = next(iter(single_node_qg.nodes))
        qnode = single_node_qg.nodes[qnode_key]

        # Swap any curies on the qnode for their canonical versions
        if qnode.ids:
            qnode.ids = eu.get_canonical_curies_list(qnode.ids, log)
            qnode.categories = None  # Important to clear this to avoid discrepancies in types for particular concepts

        # Send request to plover
        plover_answer, response_status = self._answer_query_using_plover(single_node_qg, log)
        if response_status != 200:
            log.error(f"Plover returned response of {response_status}. Answer was: {plover_answer}", error_code="RequestFailed")
            return QGOrganizedKnowledgeGraph()

        return self._load_plover_answer_into_object_model(plover_answer, log)
示例#8
0
 def _prune_highly_connected_nodes(kg: QGOrganizedKnowledgeGraph, qedge_key: str, input_curies: Set[str],
                                   input_qnode_key: str, max_edges_per_input_curie: int, log: ARAXResponse) -> QGOrganizedKnowledgeGraph:
     """
     Cap the number of qedge_key edges attached to each input curie, then drop nodes left without edges.

     For every input curie with more than max_edges_per_input_curie edges, a random selection of the
     excess edges is deleted and the matching input node (if present under input_qnode_key) is tagged
     with a "biolink:incomplete_result_set" attribute, unless it already has one.

     :param kg: The knowledge graph to prune (modified in place).
     :param qedge_key: Key of the query edge whose answer edges are considered.
     :param input_curies: Curies treated as inputs; an edge counts toward each input curie it touches.
     :param input_qnode_key: Key of the query node under which input nodes are stored.
     :param max_edges_per_input_curie: Maximum number of edges to keep per input curie.
     :param log: Response object used for debug logging.
     :return: The same kg object, pruned.
     """
     edges_for_qedge = kg.edges_by_qg_id[qedge_key]

     # Group edge keys by the input curie(s) each edge touches (subject is checked before object)
     edge_keys_by_input_curie = defaultdict(set)
     for edge_key, edge in edges_for_qedge.items():
         for endpoint in (edge.subject, edge.object):
             if endpoint in input_curies:
                 edge_keys_by_input_curie[endpoint].add(edge_key)

     # Trim each over-connected input curie down to the limit
     for input_curie, edge_key_set in edge_keys_by_input_curie.items():
         shuffled_edge_keys = list(edge_key_set)
         if len(shuffled_edge_keys) <= max_edges_per_input_curie:
             continue
         random.shuffle(shuffled_edge_keys)  # Which edges survive for this input curie is random
         surplus_edge_keys = shuffled_edge_keys[max_edges_per_input_curie:]
         log.debug(f"Randomly removing {len(surplus_edge_keys)} edges from answer for input curie {input_curie}")
         for surplus_edge_key in surplus_edge_keys:
             edges_for_qedge.pop(surplus_edge_key, None)

         # Flag the input node so consumers know its answers were truncated
         input_node = kg.nodes_by_qg_id[input_qnode_key].get(input_curie)
         if not input_node:
             continue
         if not input_node.attributes:
             input_node.attributes = []
         already_flagged = any(existing.attribute_type_id == "biolink:incomplete_result_set"
                               for existing in input_node.attributes)
         if already_flagged:
             continue
         truncation_note = (f"This attribute indicates that not all nodes/edges returned as answers "
                            f"for this input curie were included in the final answer due to size "
                            f"limitations. {max_edges_per_input_curie} edges for this input curie were kept.")
         input_node.attributes.append(Attribute(attribute_type_id="biolink:incomplete_result_set",  # TODO: request this as actual biolink item?
                                                value_type_id="metatype:Boolean",
                                                value=True,
                                                attribute_source="infores:rtx-kg2",
                                                description=truncation_note))

     # Remove nodes that no remaining edge refers to
     node_keys_in_use = kg.get_all_node_keys_used_by_edges()
     for qnode_key, nodes in kg.nodes_by_qg_id.items():
         orphan_node_keys = set(nodes).difference(node_keys_in_use)
         if not orphan_node_keys:
             continue
         log.debug(f"Removing {len(orphan_node_keys)} {qnode_key} nodes orphaned by the above step")
         for orphan_node_key in orphan_node_keys:
             del nodes[orphan_node_key]
     return kg
示例#9
0
    def _load_answers_into_kg(
        self, neo4j_results: List[Dict[str, List[Dict[str, any]]]],
        kg_name: str, qg: QueryGraph, log: ARAXResponse
    ) -> Tuple[QGOrganizedKnowledgeGraph, Dict[str, Dict[str, str]]]:
        """
        Convert the first row of neo4j results into a knowledge graph plus an edge-to-nodes map.

        Columns whose name starts with 'nodes' (e.g. 'nodes_n00') are loaded as nodes under the qnode
        key that follows the 'nodes_' prefix; columns starting with 'edges' (e.g. 'edges_e01') are
        loaded as edges under the qedge key that follows 'edges_'. Other columns are ignored.

        :param neo4j_results: Results from neo4j; only the first element is read.
        :param kg_name: Name of the knowledge graph queried; a node UUID-to-curie lookup is built
                        only when this is "KG1".
        :param qg: The query graph that was answered.
        :param log: Response object used for logging.
        :return: A tuple of the populated knowledge graph and a dict mapping each edge key to
                 {qnode_key: value stored under that qnode_key on the neo4j edge}.
        """
        first_qedge_key = next(iter(qg.edges))
        log.debug(f"Processing query results for edge {first_qedge_key}")
        answer_kg = QGOrganizedKnowledgeGraph()
        edge_to_nodes_map = dict()
        results_table = neo4j_results[0]
        if kg_name == "KG1":
            uuid_to_curie = self._build_node_uuid_to_curie_dict(results_table)
        else:
            uuid_to_curie = dict()

        # Work from a snapshot of the column names, then pull each column's rows by name
        for column_name in list(results_table):
            if column_name.startswith('nodes'):
                column_qnode_key = column_name.replace("nodes_", "", 1)
                for neo4j_node in results_table.get(column_name):
                    node_key, node = self._convert_neo4j_node_to_swagger_node(neo4j_node, kg_name)
                    answer_kg.add_node(node_key, node, column_qnode_key)
            elif column_name.startswith('edges'):
                column_qedge_key = column_name.replace("edges_", "", 1)
                for neo4j_edge in results_table.get(column_name):
                    edge_key, edge = self._convert_neo4j_edge_to_swagger_edge(neo4j_edge, uuid_to_curie, kg_name)

                    # Note what the neo4j edge holds for each qnode_key in the query graph
                    nodes_for_edge = edge_to_nodes_map.setdefault(edge_key, dict())
                    for qnode_key in qg.nodes:
                        nodes_for_edge[qnode_key] = neo4j_edge.get(qnode_key)

                    answer_kg.add_edge(edge_key, edge, column_qedge_key)

        return answer_kg, edge_to_nodes_map
示例#10
0
    def answer_one_hop_query(
            self, query_graph: QueryGraph) -> QGOrganizedKnowledgeGraph:
        """
        This function answers a one-hop (single-edge) query using CHP client.

        Before querying, it stores on self the survival threshold (read from the response's
        'CHP_survival_threshold' parameter) and the gene and drug curies the CHP client allows.
        Drug curies are stored with their 'CHEMBL:' prefix rewritten to 'CHEMBL.COMPOUND:'.

        :param query_graph: A TRAPI query graph.
        :return: An (almost) TRAPI knowledge graph containing all of the nodes and edges returned as
                results for the query. (Organized by QG IDs.)
        """
        # Set up the required parameters
        log = self.response
        self.CHP_survival_threshold = float(
            log.data['parameters']['CHP_survival_threshold'])
        allowable_curies = self.client.curies()
        self.allowable_gene_curies = list(allowable_curies['biolink:Gene'])
        self.allowable_drug_curies = [
            curie_id.replace('CHEMBL:', 'CHEMBL.COMPOUND:')
            for curie_id in allowable_curies['biolink:Drug']
        ]

        # The helper builds its own knowledge graph, so nothing needs to be pre-allocated here
        return self._answer_query_using_CHP_client(query_graph, log)
示例#11
0
 def _load_plover_answer_into_object_model(self, plover_answer: Dict[str, Dict[str, Union[set, dict]]],
                                           log: ARAXResponse) -> QGOrganizedKnowledgeGraph:
     """
     Convert a Plover answer into a knowledge graph organized by QG IDs.

     :param plover_answer: Plover's answer; its "nodes" entry maps qnode keys to {node_key: node_tuple}
                           and its "edges" entry maps qedge keys to {edge_key: edge_tuple}.
     :param log: Response object used for debug logging (counts and load timings).
     :return: The populated knowledge graph.
     """
     answer_kg = QGOrganizedKnowledgeGraph()
     # Load returned nodes into TRAPI object model
     for qnode_key, nodes in plover_answer["nodes"].items():
         num_nodes = len(nodes)
         log.debug(f"Loading {num_nodes} {qnode_key} nodes into TRAPI object model")
         start = time.time()
         for node_key, node_tuple in nodes.items():
             node = self._convert_kg2c_plover_node_to_trapi_node(node_tuple)
             answer_kg.add_node(node_key, node, qnode_key)
         log.debug(f"Loading {num_nodes} {qnode_key} nodes into TRAPI object model took "
                   f"{round(time.time() - start, 2)} seconds")
     # Load returned edges into TRAPI object model
     for qedge_key, edges in plover_answer["edges"].items():
         num_edges = len(edges)
         # Name the qedge here too, so the start and finish messages can be paired up in the log
         log.debug(f"Loading {num_edges} {qedge_key} edges into TRAPI object model")
         start = time.time()
         for edge_key, edge_tuple in edges.items():
             edge = self._convert_kg2c_plover_edge_to_trapi_edge(edge_tuple)
             answer_kg.add_edge(edge_key, edge, qedge_key)
         log.debug(f"Loading {num_edges} {qedge_key} edges into TRAPI object model took "
                   f"{round(time.time() - start, 2)} seconds")
     return answer_kg
示例#12
0
    def answer_one_hop_query(
            self, query_graph: QueryGraph) -> QGOrganizedKnowledgeGraph:
        """
        This function answers a one-hop (single-edge) query using either KG1 or KG2.

        KG2c queries are sent to Plover first, with neo4j as the fallback when Plover does not
        return a 200 response; every other knowledge graph is queried through neo4j directly.

        :param query_graph: A TRAPI query graph.
        :return: An (almost) TRAPI knowledge graph containing all of the nodes and edges returned as
                results for the query. (Organized by QG IDs.) Empty if the query graph does not have
                exactly one edge and exactly two nodes (an error is logged in that case).
        """
        log = self.response
        enforce_directionality = self.enforce_directionality
        use_synonyms = self.use_synonyms
        kg_name = self.kg_name
        if kg_name == "KG1":
            query_graph = eu.make_qg_use_old_snake_case_types(query_graph)
        final_kg = QGOrganizedKnowledgeGraph()

        # Verify this is a valid one-hop query graph
        if len(query_graph.edges) != 1:
            log.error(
                f"answer_one_hop_query() was passed a query graph that is not one-hop: "
                f"{query_graph.to_dict()}",
                error_code="InvalidQuery")
            return final_kg
        if len(query_graph.nodes) != 2:
            # This check also fires for fewer than two nodes, so the message must not say "more than"
            log.error(
                f"answer_one_hop_query() was passed a query graph that does not have exactly two nodes: "
                f"{query_graph.to_dict()}",
                error_code="InvalidQuery")
            return final_kg
        qedge_key = next(iter(query_graph.edges))

        # Consider any inverses of our predicate(s) as well
        query_graph = self._add_inverted_predicates(query_graph, log)

        # Convert qnode curies as needed (either to synonyms or to canonical versions)
        qnode_keys_with_curies = [
            qnode_key for qnode_key, qnode in query_graph.nodes.items()
            if qnode.id
        ]
        for qnode_key in qnode_keys_with_curies:
            qnode = query_graph.nodes[qnode_key]
            if use_synonyms and kg_name == "KG1":
                qnode.id = eu.get_curie_synonyms(qnode.id, log)
            elif kg_name == "KG2c":
                canonical_curies = eu.get_canonical_curies_list(qnode.id, log)
                log.debug(
                    f"Using {len(canonical_curies)} curies as canonical curies for qnode {qnode_key}"
                )
                qnode.id = canonical_curies
            qnode.category = [
            ]  # Important to clear this, otherwise results are limited (#889)

        if kg_name == "KG2c":
            # Use Plover to answer KG2c queries
            plover_answer, response_status = self._answer_query_using_plover(
                query_graph, log)
            if response_status == 200:
                final_kg = self._grab_nodes_and_edges_from_sqlite(
                    plover_answer, kg_name, log)
            else:
                # Backup to using neo4j in the event plover failed
                log.warning(
                    f"Plover returned a {response_status} response, so I'm backing up to Neo4j.."
                )
                final_kg = self._answer_query_using_neo4j(
                    query_graph, kg_name, qedge_key, enforce_directionality,
                    log)
        else:
            # Use Neo4j for KG2 and KG1 queries
            final_kg = self._answer_query_using_neo4j(query_graph, kg_name,
                                                      qedge_key,
                                                      enforce_directionality,
                                                      log)

        return final_kg
示例#13
0
    def answer_one_hop_query(
        self, query_graph: QueryGraph
    ) -> Tuple[QGOrganizedKnowledgeGraph, Dict[str, Dict[str, str]]]:
        """
        Answer a single-edge query using NGD, with KG2 supplying the candidate node pairs.

        The query is first run against KG2 through an ARAX DSL action list. NGD is then computed for
        the node pair of every KG2 edge, and an NGD edge (plus its two nodes) is created for each pair
        whose NGD value is below a fixed threshold of 0.5.

        :param query_graph: A Reasoner API standard query graph.
        :return: A tuple of:
            1. a knowledge graph (organized by QG IDs) holding the NGD edges and their nodes;
            2. a map from each NGD edge key to the node keys fulfilling the source and target
               qnode keys, e.g.
               {'KG1:111221': {'n00': 'DOID:111', 'n01': 'HP:124'}, 'KG1:111223': {'n00': 'DOID:111', 'n01': 'HP:126'}}
            Both are empty if validation or the KG2 query leaves log.status other than 'OK'.
        """
        log = self.response
        final_kg = QGOrganizedKnowledgeGraph()
        edge_to_nodes_map = dict()

        # Bail out early on anything that is not a valid one-hop query graph
        self._verify_one_hop_query_graph_is_valid(query_graph, log)
        if log.status != 'OK':
            return final_kg, edge_to_nodes_map

        # Ask KG2 for candidate answers
        log.debug("Finding potential answers using KG2")
        qedge_key = next(iter(query_graph.edges))
        qedge = query_graph.edges[qedge_key]
        source_qnode_key, target_qnode_key = qedge.subject, qedge.object
        source_qnode = query_graph.nodes[source_qnode_key]
        target_qnode = query_graph.nodes[target_qnode_key]
        # Each list holds the DSL parameters for one action; empty entries are dropped when joining
        qedge_params = [
            f"key={qedge_key}",
            f"subject={source_qnode_key}",
            f"object={target_qnode_key}",
            self._get_dsl_qedge_type_str(qedge),
        ]
        source_params = [
            f"key={source_qnode_key}",
            self._get_dsl_qnode_curie_str(source_qnode),
            self._get_dsl_qnode_category_str(source_qnode),
        ]
        target_params = [
            f"key={target_qnode_key}",
            self._get_dsl_qnode_curie_str(target_qnode),
            self._get_dsl_qnode_category_str(target_qnode),
        ]
        qedge_params_str = ", ".join(param for param in qedge_params if param)
        source_params_str = ", ".join(param for param in source_params if param)
        target_params_str = ", ".join(param for param in target_params if param)
        actions_list = [
            f"add_qnode({source_params_str})",
            f"add_qnode({target_params_str})",
            f"add_qedge({qedge_params_str})",
            "expand(kp=ARAX/KG2)",
            "return(message=true, store=false)",
        ]
        kg2_response, kg2_message = self._run_arax_query(actions_list, log)
        if log.status != 'OK':
            return final_kg, edge_to_nodes_map

        # Compute NGD for the node pair behind every KG2 edge
        log.debug("Calculating NGD between each potential node pair")
        kg2_answer_kg = kg2_message.knowledge_graph
        cngd = ComputeNGD(log, kg2_message, None)
        cngd.load_curie_to_pmids_data(kg2_answer_kg.nodes)
        ngd_info_by_kg2_edge_key = dict()
        for kg2_edge_key, kg2_edge in kg2_answer_kg.edges.items():
            first_node_key, second_node_key = kg2_edge.subject, kg2_edge.object
            first_node = kg2_answer_kg.nodes.get(first_node_key)  # These are already canonicalized (default behavior)
            second_node = kg2_answer_kg.nodes.get(second_node_key)
            # The KG2 query was bidirectional, so the edge may run opposite to the query edge
            follows_query_direction = (source_qnode_key in first_node.qnode_keys
                                       and target_qnode_key in second_node.qnode_keys)
            if follows_query_direction:
                ngd_subject, ngd_object = first_node_key, second_node_key
            else:
                ngd_subject, ngd_object = second_node_key, first_node_key
            ngd_info_by_kg2_edge_key[kg2_edge_key] = {
                "ngd_value": cngd.calculate_ngd_fast(ngd_subject, ngd_object),
                "subject": ngd_subject,
                "object": ngd_object,
            }

        # Turn every pair with a low enough NGD value into an NGD edge
        threshold = 0.5
        log.debug(f"Creating edges between node pairs with NGD below the threshold ({threshold})")
        for ngd_info in ngd_info_by_kg2_edge_key.values():
            ngd_value = ngd_info['ngd_value']
            # TODO: Make determination of the threshold much more sophisticated
            if ngd_value is None or not ngd_value < threshold:
                continue
            subject_key = ngd_info["subject"]
            object_key = ngd_info["object"]
            ngd_edge_key, ngd_edge = self._create_ngd_edge(ngd_value, subject_key, object_key)
            ngd_source_node_key, ngd_source_node = self._create_ngd_node(
                ngd_edge.subject, kg2_answer_kg.nodes.get(ngd_edge.subject))
            ngd_target_node_key, ngd_target_node = self._create_ngd_node(
                ngd_edge.object, kg2_answer_kg.nodes.get(ngd_edge.object))
            final_kg.add_edge(ngd_edge_key, ngd_edge, qedge_key)
            final_kg.add_node(ngd_source_node_key, ngd_source_node, source_qnode_key)
            final_kg.add_node(ngd_target_node_key, ngd_target_node, target_qnode_key)
            edge_to_nodes_map[ngd_edge_key] = {
                source_qnode_key: ngd_source_node_key,
                target_qnode_key: ngd_target_node_key,
            }

        return final_kg, edge_to_nodes_map
示例#14
0
    def _answer_query_using_CHP_client(
            self, query_graph: QueryGraph,
            log: ARAXResponse) -> QGOrganizedKnowledgeGraph:
        """
        Answer a one-hop gene/drug query using the CHP client.

        Each end of the qedge either carries curies (vetted with self._check_id) or only a category
        (which must normalize to drug/chemicalsubstance or gene). One end has to be a drug and the
        other a gene. When both ends carry curies, every (source, target) pair is sent to CHP as its
        own query. When only one end carries curies, the other end is filled in with every allowable
        CHP curie of the requested category and one batch of queries is sent per known curie.

        :param query_graph: A query graph; its first qedge and that qedge's subject/object qnodes are used.
        :param log: Response object that debug/warning/error messages are written to.
        :return: A knowledge graph (organized by QG IDs) with one "paired_with" edge per gene/drug pair
                 plus the nodes those edges touch. It is returned empty if validation logged an error.
        """
        qedge_key = next(iter(query_graph.edges))
        log.debug(
            f"Processing query results for edge {qedge_key} by using CHP client"
        )
        final_kg = QGOrganizedKnowledgeGraph()

        qedge = query_graph.edges[qedge_key]
        source_qnode_key = qedge.subject
        target_qnode_key = qedge.object
        source_qnode = query_graph.nodes[source_qnode_key]
        target_qnode = query_graph.nodes[target_qnode_key]

        # At least one end of the edge has to be pinned to curies
        if source_qnode.id is None and target_qnode.id is None:
            log.error(f"Both ends of edge {qedge_key} are None",
                      error_code="BadEdge")
            return final_kg

        # Validate the source end first, then the target end; each logs its own error on failure
        source_end = self._resolve_CHP_qnode(source_qnode_key, source_qnode, log)
        if source_end is None:
            return final_kg
        target_end = self._resolve_CHP_qnode(target_qnode_key, target_qnode, log)
        if target_end is None:
            return final_kg
        source_curies, source_category, source_kind = source_end
        target_curies, target_category, target_kind = target_end

        # CHP pairs a gene with a drug, so the two ends must be of different kinds
        if source_kind == target_kind:
            log.error(
                f"The query nodes in both ends of edge are the same type which is {source_kind}",
                error_code="CategoryError")
            return final_kg

        disease = 'MONDO:0007254'
        outcome = ('EFO:0000714', '>=', self.CHP_survival_threshold)
        # curie -> qnode key, recorded only for curies that ended up on at least one edge
        source_dict = dict()
        target_dict = dict()

        if source_curies is not None and target_curies is not None:
            # Both ends are pinned: one single query per (source, target) pair
            for source_curie, target_curie in itertools.product(
                    source_curies, target_curies):
                if source_kind == 'drug':
                    gene_curie, drug_curie = target_curie, source_curie
                else:
                    gene_curie, drug_curie = source_curie, target_curie

                # CHP is queried with the 'CHEMBL:' prefix; the edge keeps the original curie
                q = build_query(genes=[gene_curie],
                                therapeutic=drug_curie.replace(
                                    'CHEMBL.COMPOUND:', 'CHEMBL:'),
                                disease=disease,
                                outcome=outcome)
                response = self.client.query(q)
                max_probability = self.client.get_outcome_prob(response)
                swagger_edge_key, swagger_edge = self._convert_to_swagger_edge(
                    gene_curie, drug_curie, "paired_with", max_probability)

                source_dict[source_curie] = source_qnode_key
                target_dict[target_curie] = target_qnode_key

                # Finally add the current edge to our answer knowledge graph
                final_kg.add_edge(swagger_edge_key, swagger_edge, qedge_key)
        else:
            # Only one end is pinned; the other end is filled in from CHP's allowable curies
            source_is_known = source_curies is not None
            if source_is_known:
                known_curies, known_kind = source_curies, source_kind
                open_category = target_category
            else:
                known_curies, known_kind = target_curies, target_kind
                open_category = source_category

            # The candidates do not depend on the known curie, so they are computed once up front
            if known_kind == 'drug':
                candidates = [
                    curie for curie in self.allowable_gene_curies
                    if self._curie_has_CHP_category(curie, open_category)
                ]
            else:
                # Looked up in 'CHEMBL.COMPOUND:' form, kept in 'CHEMBL:' form for querying CHP
                candidates = [
                    curie.replace('CHEMBL.COMPOUND:', 'CHEMBL:')
                    for curie in self.allowable_drug_curies
                    if self._curie_has_CHP_category(
                        curie.replace('CHEMBL:', 'CHEMBL.COMPOUND:'),
                        open_category)
                ]

            for known_curie in known_curies:
                if known_kind == 'drug':
                    therapeutic = known_curie.replace('CHEMBL.COMPOUND:',
                                                      'CHEMBL:')
                    queries = [
                        build_query(
                            genes=[gene],
                            therapeutic=therapeutic,
                            disease=disease,
                            outcome=outcome,
                        ) for gene in candidates
                    ]
                else:
                    queries = [
                        build_query(
                            genes=[known_curie],
                            therapeutic=drug,
                            disease=disease,
                            outcome=outcome,
                        ) for drug in candidates
                    ]

                # use the query_all endpoint to run the batch of queries
                res = self.client.query_all(queries)

                for result, candidate in zip(res["message"], candidates):
                    prob = self.client.get_outcome_prob(result)
                    if known_kind == 'drug':
                        found_curie = candidate
                        gene_curie, drug_curie = candidate, known_curie
                    else:
                        # Switch the drug back to the 'CHEMBL.COMPOUND:' prefix for the answer KG
                        found_curie = candidate.replace('CHEMBL:',
                                                        'CHEMBL.COMPOUND:')
                        gene_curie, drug_curie = known_curie, found_curie
                    swagger_edge_key, swagger_edge = self._convert_to_swagger_edge(
                        gene_curie, drug_curie, "paired_with", prob)

                    if source_is_known:
                        source_dict[known_curie] = source_qnode_key
                        target_dict[found_curie] = target_qnode_key
                    else:
                        source_dict[found_curie] = source_qnode_key
                        target_dict[known_curie] = target_qnode_key

                    # Finally add the current edge to our answer knowledge graph
                    final_kg.add_edge(swagger_edge_key, swagger_edge,
                                      qedge_key)

        # Add the nodes to our answer knowledge graph (source nodes first, then target nodes)
        for curie_to_qnode_key in (source_dict, target_dict):
            for curie, qnode_key in curie_to_qnode_key.items():
                swagger_node_key, swagger_node = self._convert_to_swagger_node(
                    curie)
                final_kg.add_node(swagger_node_key, swagger_node, qnode_key)

        return final_kg

    def _resolve_CHP_qnode(self, qnode_key: str, qnode, log: ARAXResponse):
        """
        Validate one end of the qedge for the CHP client and work out whether it is a drug or a gene.

        :param qnode_key: Key of the qnode; only used in the category error message.
        :param qnode: The qnode. Its `id` (None, a single curie string, or a collection of curies) and,
                      when `id` is None, the first entry of its `category` are read.
        :param log: Response object that warnings/errors are written to.
        :return: None if an error was logged (or self._check_id reported one). Otherwise a tuple
                 (curies, category, kind): `curies` is the list accepted by self._check_id (None when
                 the qnode has no curies), `category` is the normalized category (None when the qnode
                 has curies), and `kind` is 'drug' or 'gene'.
        """
        drug_label_list = ['drug', 'chemicalsubstance']
        gene_label_list = ['gene']

        if qnode.id is None:
            # No curies: the category alone decides what this end is
            category = qnode.category[0].replace('biolink:', '').replace(
                '_', '').lower()
            if category in drug_label_list:
                return None, category, 'drug'
            if category in gene_label_list:
                return None, category, 'gene'
            log.error(
                f"The category of query node {qnode_key} is unsatisfiable. It has to be drug/chemical_substance or gene",
                error_code="CategoryError")
            return None

        has_error, pass_nodes, not_pass_nodes = self._check_id(qnode.id, log)
        if has_error:
            return None

        if len(pass_nodes) == 0:
            # Nothing usable was supplied; the same error code is used for either end of the edge
            if isinstance(qnode.id, str):
                log.error(
                    f"The curie id of {qnode.id} is not allowable based on CHP client",
                    error_code="NotAllowable")
            else:
                log.error(
                    f"The curie ids of {qnode.id} are not allowable based on CHP client",
                    error_code="NotAllowable")
            return None

        # Some curies are usable; just warn about the ones that were dropped
        if len(not_pass_nodes) == 1:
            log.warning(
                f"The curie id of {not_pass_nodes[0]} is not allowable based on CHP client"
            )
        elif len(not_pass_nodes) != 0:
            log.warning(
                f"The curie ids of these nodes {not_pass_nodes} are not allowable based on CHP client"
            )

        # The first accepted curie decides the kind for the whole end
        kind = 'drug' if pass_nodes[0] in self.allowable_drug_curies else 'gene'
        return pass_nodes, None, kind

    def _curie_has_CHP_category(self, curie: str, category: str) -> bool:
        """
        Tell whether the synonymizer recognizes `curie` and lists `category` among its categories.

        :param curie: Curie to look up with self.synonymizer.get_canonical_curies.
        :param category: Normalized category name (no 'biolink:' prefix, no underscores, lowercase).
        :return: True if the curie is known and one of its 'all_categories' keys normalizes to `category`.
        """
        if self.synonymizer.get_canonical_curies(curie)[curie] is None:
            return False
        all_categories = self.synonymizer.get_canonical_curies(
            curie, return_all_categories=True)[curie]['all_categories']
        return category in [
            name.replace('biolink:', '').replace('_', '').lower()
            for name in all_categories.keys()
        ]
示例#15
0
    def answer_one_hop_query(
        self, query_graph: QueryGraph
    ) -> Tuple[QGOrganizedKnowledgeGraph, Dict[str, Dict[str, str]]]:
        """
        Answer a single-edge query against the knowledge graph selected by self.kg_name.
        :param query_graph: A Reasoner API standard query graph; it must have exactly one edge and two nodes.
        :return: A 2-tuple of:
            1. a knowledge graph (dictionary version, organized by QG IDs) holding the nodes and edges
               found for the query;
            2. a dict recording, per edge, which node fulfilled which qnode_key. Example:
              {'KG1:111221': {'n00': 'DOID:111', 'n01': 'HP:124'}, 'KG1:111223': {'n00': 'DOID:111', 'n01': 'HP:126'}}
            Whatever has been built so far is returned as soon as the log status stops being 'OK'.
        """
        response_log = self.response
        directed = self.enforce_directionality
        expand_synonyms = self.use_synonyms
        kg = self.kg_name
        answer_kg = QGOrganizedKnowledgeGraph()
        nodes_by_edge = dict()
        # Temporary patch until we switch to KG2.5.1
        query_graph = eu.make_qg_use_old_types(query_graph)

        # Reject anything that is not exactly one edge joining two nodes
        if len(query_graph.edges) != 1:
            response_log.error(
                f"answer_one_hop_query() was passed a query graph that is not one-hop: {query_graph.to_dict()}",
                error_code="InvalidQuery")
            return answer_kg, nodes_by_edge
        if len(query_graph.nodes) != 2:
            response_log.error(
                f"answer_one_hop_query() was passed a query graph with more than two nodes: {query_graph.to_dict()}",
                error_code="InvalidQuery")
            return answer_kg, nodes_by_edge
        qedge_key = next(iter(query_graph.edges))

        # Swap the curies on pinned qnodes for synonyms (KG1) or canonical curies (KG2c)
        pinned_qnodes = {
            key: node
            for key, node in query_graph.nodes.items() if node.id
        }
        for qnode_key, qnode in pinned_qnodes.items():
            if expand_synonyms and kg == "KG1":
                qnode.id = eu.get_curie_synonyms(qnode.id, response_log)
            elif kg == "KG2c":
                canonical = eu.get_canonical_curies_list(
                    qnode.id, response_log)
                response_log.debug(
                    f"Using {len(canonical)} curies as canonical curies for qnode {qnode_key}"
                )
                qnode.id = canonical
            # Important to clear this, otherwise results are limited (#889)
            qnode.category = []

        # Build the cypher, run it, and load the answers; stop at the first step that fails
        cypher_query = self._convert_one_hop_query_graph_to_cypher_query(
            query_graph, directed, kg, response_log)
        if response_log.status != 'OK':
            return answer_kg, nodes_by_edge
        neo4j_results = self._answer_query_using_neo4j(
            cypher_query, qedge_key, kg, response_log)
        if response_log.status != 'OK':
            return answer_kg, nodes_by_edge
        answer_kg, nodes_by_edge = self._load_answers_into_kg(
            neo4j_results, kg, query_graph, response_log)

        if response_log.status == 'OK':
            # TODO: remove this patch once we switch to KG2.5.0!
            eu.convert_node_and_edge_types_to_new_format(answer_kg)

        return answer_kg, nodes_by_edge
示例#16
0
    def answer_one_hop_query(self, query_graph: QueryGraph) -> QGOrganizedKnowledgeGraph:
        """
        Answer a single-edge query against KG2c by sending it to PloverDB in batches of input curies.
        :param query_graph: A TRAPI query graph; it must have exactly one edge and two nodes.
        :return: An (almost) TRAPI knowledge graph, organized by QG IDs, holding whatever nodes and edges
                had been collected when the method finished or stopped after logging an error.
        """
        response_log = self.response
        answer_kg = QGOrganizedKnowledgeGraph()

        # Reject anything that is not exactly one edge joining two nodes
        if len(query_graph.edges) != 1:
            response_log.error(
                f"answer_one_hop_query() was passed a query graph that is not one-hop: {query_graph.to_dict()}",
                error_code="InvalidQuery")
            return answer_kg
        if len(query_graph.nodes) != 2:
            response_log.error(
                f"answer_one_hop_query() was passed a query graph with more than two nodes: {query_graph.to_dict()}",
                error_code="InvalidQuery")
            return answer_kg

        # Swap the curies on pinned qnodes for their canonical versions
        pinned_qnodes = {key: node for key, node in query_graph.nodes.items() if node.ids}
        for qnode_key, qnode in pinned_qnodes.items():
            canonical = eu.get_canonical_curies_list(qnode.ids, response_log)
            response_log.debug(f"Using {len(canonical)} curies as canonical curies for qnode {qnode_key}")
            qnode.ids = canonical
            # Important to clear this, otherwise results are limited (#889)
            qnode.categories = None

        # Slice the input curies into batches for Plover
        qedge_key = next(iter(query_graph.edges))
        input_qnode_key = self._get_input_qnode_key(query_graph)
        all_input_curies = query_graph.nodes[input_qnode_key].ids
        all_input_curies_set = set(all_input_curies)
        batch_starts = range(0, len(all_input_curies), self.curie_batch_size)
        curie_batches = [all_input_curies[start:start + self.curie_batch_size] for start in batch_starts]
        response_log.debug(
            f"Split {len(all_input_curies)} input curies into {len(curie_batches)} batches to send to Plover")
        response_log.info(f"Max edges allowed per input curie for this query is: {self.max_edges_per_input_curie}")

        for batch_num, curie_batch in enumerate(curie_batches, start=1):
            response_log.debug(f"Sending batch {batch_num} to Plover (has {len(curie_batch)} input curies)")
            query_graph.nodes[input_qnode_key].ids = curie_batch
            plover_answer, response_status = self._answer_query_using_plover(query_graph, response_log)
            if response_status != 200:
                response_log.error(f"Plover returned response of {response_status}. Answer was: {plover_answer}",
                                   error_code="RequestFailed")
                return answer_kg

            batch_kg = self._load_plover_answer_into_object_model(plover_answer, response_log)
            answer_kg = eu.merge_two_kgs(batch_kg, answer_kg)
            if not answer_kg.edges_by_qg_id.get(qedge_key):
                continue

            # Prune down highly-connected input curies if we're over the max number of allowed edges
            if len(answer_kg.edges_by_qg_id[qedge_key]) > self.max_allowed_edges:
                response_log.debug(
                    f"Have exceeded max num allowed edges ({self.max_allowed_edges}); will attempt to "
                    f"reduce the number of edges by pruning down highly connected nodes")
                answer_kg = self._prune_highly_connected_nodes(
                    answer_kg, qedge_key, all_input_curies_set, input_qnode_key,
                    self.max_edges_per_input_curie, response_log)
            # Error out if this pruning wasn't sufficient to bring down the edge count
            if len(answer_kg.edges_by_qg_id[qedge_key]) > self.max_allowed_edges:
                response_log.error(
                    f"Query for qedge {qedge_key} produced more than {self.max_allowed_edges} edges, "
                    f"which is too much for the system to handle. You must somehow make your query "
                    f"smaller (specify fewer input curies or use more specific predicates/categories).",
                    error_code="QueryTooLarge")
                return answer_kg

        return answer_kg
示例#17
0
    def answer_one_hop_query(
            self, query_graph: QueryGraph) -> QGOrganizedKnowledgeGraph:
        """
        Answer a single-edge (one-hop) query using NGD, with KG2 supplying the candidate node pairs.

        :param query_graph: A TRAPI query graph.
        :return: An (almost) TRAPI knowledge graph holding the nodes and edges found for the query,
                organized by QG IDs. It is returned empty when validation, the predicate check or
                the KG2 lookup leaves the log in a non-'OK' state.
        """
        log = self.response
        final_kg = QGOrganizedKnowledgeGraph()

        # Bail out unless the query graph is a valid one-hop graph
        self._verify_one_hop_query_graph_is_valid(query_graph, log)
        if log.status != 'OK':
            return final_kg

        # Grab the (first) qedge and make sure its predicates, if any, overlap with the accepted ones
        qedge_key = next(iter(query_graph.edges))
        query_edge = query_graph.edges[qedge_key]
        if query_edge.predicates and set(query_edge.predicates).isdisjoint(
                self.accepted_qedge_predicates):
            log.error(
                f"NGD can only expand qedges with these predicates: {self.accepted_qedge_predicates}. QEdge"
                f" {qedge_key}'s predicate is: {query_edge.predicates}",
                error_code="UnsupportedQG")
            return final_kg
        source_qnode_key = query_edge.subject
        target_qnode_key = query_edge.object

        # Ask KG2 for candidate answers, using a copy of the query graph with all predicates cleared
        log.debug("Finding potential answers using KG2")
        predicate_free_qg = copy.deepcopy(query_graph)
        for copied_qedge in predicate_free_qg.edges.values():
            copied_qedge.predicates = None
        request_body = {"message": {"query_graph": predicate_free_qg.to_dict()}}
        _, kg2_message = self._run_arax_query(request_body, log)
        if log.status != 'OK':
            return final_kg

        # First pass: compute an NGD value for the node pair of every edge KG2 returned
        log.debug("Calculating NGD between each potential node pair")
        kg2_answer_kg = kg2_message.knowledge_graph
        ngd_calculator = ComputeNGD(log, kg2_message, None)
        ngd_calculator.load_curie_to_pmids_data(kg2_answer_kg.nodes)
        ngd_results = []  # One (ngd_value, subject_key, object_key, pmid_curies) tuple per KG2 edge
        for kg2_edge in kg2_answer_kg.edges.values():
            first_key = kg2_edge.subject
            second_key = kg2_edge.object
            first_node = kg2_answer_kg.nodes.get(first_key)
            second_node = kg2_answer_kg.nodes.get(second_key)
            # The KG2 edge direction need not match the qedge direction, so orient the pair by
            # checking which qnode keys each node fulfills
            same_direction = (source_qnode_key in first_node.qnode_keys
                              and target_qnode_key in second_node.qnode_keys)
            if same_direction:
                subject_key, object_key = first_key, second_key
            else:
                subject_key, object_key = second_key, first_key
            ngd_value, pmid_set = ngd_calculator.calculate_ngd_fast(subject_key, object_key)
            pmid_curies = [f"PMID:{pmid}" for pmid in pmid_set]
            ngd_results.append((ngd_value, subject_key, object_key, pmid_curies))

        # Second pass: add an NGD edge (plus its two nodes) for every pair scoring below the threshold
        threshold = 0.5
        log.debug(
            f"Creating edges between node pairs with NGD below the threshold ({threshold})"
        )
        for ngd_value, subject_key, object_key, pmid_curies in ngd_results:
            # TODO: Make determination of the threshold much more sophisticated
            # (written as "not <" rather than ">=" so that a NaN value is skipped as well)
            if ngd_value is None or not (ngd_value < threshold):
                continue
            ngd_edge_key, ngd_edge = self._create_ngd_edge(
                ngd_value, subject_key, object_key, pmid_curies)
            source_node_key, source_node = self._create_ngd_node(
                ngd_edge.subject, kg2_answer_kg.nodes.get(ngd_edge.subject))
            target_node_key, target_node = self._create_ngd_node(
                ngd_edge.object, kg2_answer_kg.nodes.get(ngd_edge.object))
            final_kg.add_edge(ngd_edge_key, ngd_edge, qedge_key)
            final_kg.add_node(source_node_key, source_node, source_qnode_key)
            final_kg.add_node(target_node_key, target_node, target_qnode_key)

        return final_kg
示例#18
0
    def answer_one_hop_query(
        self, query_graph: QueryGraph
    ) -> Tuple[QGOrganizedKnowledgeGraph, Dict[str, Dict[str, str]]]:
        """
        Answer a single-edge (one-hop) query using the Genetics Provider.

        :param query_graph: A Reasoner API standard query graph.
        :return: A tuple containing:
            1. an (almost) Reasoner API standard knowledge graph with the nodes and edges returned for
               the query. (Organized by QG IDs.)
            2. a map from each added edge's key to the node that fulfilled each qnode_key. Example:
              {'KG1:111221': {'n00': 'DOID:111', 'n01': 'HP:124'}, 'KG1:111223': {'n00': 'DOID:111', 'n01': 'HP:126'}}
        """
        log = self.response
        include_all_scores = self.response.data['parameters']['include_all_scores']
        final_kg = QGOrganizedKnowledgeGraph()
        edge_to_nodes_map = dict()
        # Temporary patch until TRAPI 1.0 KP endpoint is ready
        legacy_qg = eu.make_qg_use_old_types(query_graph)

        # Validate the one-hop query graph, then adapt it for this KP; stop if either step fails
        self._verify_one_hop_query_graph_is_valid(legacy_qg, log)
        if log.status != 'OK':
            return final_kg, edge_to_nodes_map
        kp_query_graph = self._pre_process_query_graph(legacy_qg, log)
        if log.status != 'OK':
            return final_kg, edge_to_nodes_map
        first_qedge = next(iter(kp_query_graph.edges.values()))
        source_qnode_key = first_qedge.subject
        target_qnode_key = first_qedge.object

        # Query the KP; without a knowledge graph in the response there is nothing to load
        kp_response = self._send_query_to_kp(kp_query_graph, log)
        returned_kg = kp_response.get('knowledge_graph')
        if not returned_kg:
            log.warning(
                f"No KG is present in the response from {self.kp_name}")
            return final_kg, edge_to_nodes_map

        # Map the returned node/edge IDs to the qnode/qedge IDs they fulfill
        qg_id_mappings = self._get_qg_id_mappings_from_results(
            kp_response['results'])

        # Load the returned edges into our Swagger model
        unrecognized_score_names = set()
        for kp_edge in returned_kg['edges']:
            # Edges missing a source and/or target ID have been encountered before; skip them
            if not (kp_edge['source_id'] and kp_edge['target_id']):
                log.warning(
                    f"Edge returned from GeneticsKP is lacking a subject and/or object: {kp_edge}."
                    f" Will skip adding this edge to the KG.")
                continue
            score_name = kp_edge['score_name']
            if score_name not in self.score_type_lookup:
                unrecognized_score_names.add(score_name)
            # As written, edges carrying the magma score are always kept, while edges with any other
            # score are kept only when include_all_scores is set.
            # NOTE(review): confirm this is the intended direction of the filter.
            if not (include_all_scores or score_name == self.magma_score_name):
                continue
            kp_edge_key, swagger_edge = self._create_swagger_edge_from_kp_edge(kp_edge)
            # Convert to an ID that's unique for us
            swagger_edge_key = self._create_unique_edge_key(swagger_edge)
            for qedge_key in qg_id_mappings['edges'][kp_edge_key]:
                final_kg.add_edge(swagger_edge_key, swagger_edge, qedge_key)
            edge_to_nodes_map[swagger_edge_key] = {
                source_qnode_key: swagger_edge.subject,
                target_qnode_key: swagger_edge.object
            }
        if unrecognized_score_names:
            log.warning(
                f"Encountered unknown score(s) from {self.kp_name}: {unrecognized_score_names}. "
                f"Not sure what data type to assign these.")

        # Load the returned nodes into our Swagger model
        for kp_node in returned_kg['nodes']:
            # Skip any nodes with 'None' for their ID (see discussion in #1154)
            if not kp_node['id']:
                log.warning(
                    f"Node returned from {self.kp_name} is lacking an ID: {kp_node}."
                    f" Will skip adding this node to the KG.")
                continue
            swagger_node_key, swagger_node = self._create_swagger_node_from_kp_node(kp_node)
            for qnode_key in qg_id_mappings['nodes'][swagger_node_key]:
                final_kg.add_node(swagger_node_key, swagger_node, qnode_key)

        return final_kg, edge_to_nodes_map
示例#19
0
    def _grab_nodes_and_edges_from_sqlite(
            self, plover_answer: Dict[str, Dict[str, Set[Union[str, int]]]],
            kg_name: str, log: ARAXResponse) -> QGOrganizedKnowledgeGraph:
        """
        Load the full node and edge objects for the IDs in a Plover answer from the local KG2c sqlite file.

        :param plover_answer: Has "nodes" and "edges" entries, each mapping a qnode/qedge key to the
                              collection of node/edge IDs returned for that key.
        :param kg_name: Passed through to the neo4j-to-TRAPI node and edge converters.
        :param log: Response object that receives the debug/timing messages.
        :return: A knowledge graph (organized by QG keys) holding the converted nodes and edges.
        """
        # Work out the path of the local sqlite database (database manager-friendly method): the file
        # name comes from the RTX configuration, the directory is derived from this file's location
        path_list = os.path.realpath(__file__).split(os.path.sep)
        rtx_index = path_list.index("RTX")
        rtxc = RTXConfiguration()
        sqlite_dir_path = os.path.sep.join([
            *path_list[:(rtx_index + 1)], 'code', 'ARAX', 'KnowledgeSources',
            'KG2c'
        ])
        sqlite_name = rtxc.kg2c_sqlite_path.split('/')[-1]
        sqlite_file_path = f"{sqlite_dir_path}{os.path.sep}{sqlite_name}"
        answer_kg = QGOrganizedKnowledgeGraph()

        connection = sqlite3.connect(sqlite_file_path)
        try:
            cursor = connection.cursor()

            # Grab the node objects from sqlite corresponding to the returned node IDs
            num_nodes = sum(
                len(nodes) for nodes in plover_answer["nodes"].values())
            start = time.time()
            for qnode_key, node_keys in plover_answer["nodes"].items():
                # SQL wants ('node1', 'node2') format for string lists; double any embedded single
                # quote so an ID containing one cannot terminate the literal early
                node_keys_str = "','".join(
                    node_key.replace("'", "''") for node_key in node_keys)
                sql_query = f"SELECT N.node " \
                            f"FROM nodes AS N " \
                            f"WHERE N.id IN ('{node_keys_str}')"
                log.debug(
                    f"Looking up {len(node_keys)} returned {qnode_key} node IDs in KG2c sqlite"
                )
                cursor.execute(sql_query)
                for row in cursor.fetchall():
                    node_as_dict = ujson.loads(row[0])
                    node_key, node = self._convert_neo4j_node_to_trapi_node(
                        node_as_dict, kg_name)
                    answer_kg.add_node(node_key, node, qnode_key)
            log.debug(
                f"Grabbing {num_nodes} nodes from sqlite and loading into object model took "
                f"{round(time.time() - start, 2)} seconds")

            # Grab the edge objects from sqlite corresponding to the returned edge IDs
            num_edges = sum(
                len(edges) for edges in plover_answer["edges"].values())
            start = time.time()
            for qedge_key, edge_keys in plover_answer["edges"].items():
                # SQL wants (1, 2) format int lists
                # NOTE(review): the IDs are inserted unquoted, so this assumes edge IDs are integers
                edge_keys_str = ",".join(
                    str(edge_key) for edge_key in edge_keys)
                sql_query = f"SELECT E.edge " \
                            f"FROM edges AS E " \
                            f"WHERE E.id IN ({edge_keys_str})"
                log.debug(
                    f"Looking up {len(edge_keys)} returned {qedge_key} edge IDs in KG2c sqlite"
                )
                cursor.execute(sql_query)
                for row in cursor.fetchall():
                    edge_as_dict = ujson.loads(row[0])
                    edge_key, edge = self._convert_neo4j_edge_to_trapi_edge(
                        edge_as_dict, dict(), kg_name)
                    answer_kg.add_edge(edge_key, edge, qedge_key)
            log.debug(
                f"Grabbing {num_edges} edges from sqlite and loading into object model took "
                f"{round(time.time() - start, 2)} seconds")

            cursor.close()
        finally:
            # Always release the database handle, even if a query or a conversion above raised
            connection.close()
        return answer_kg