Пример #1
0
    def _load_answers_into_kg(self, neo4j_results: List[Dict[str,
                                                             List[Dict[str,
                                                                       any]]]],
                              kg_name: str, qg: QueryGraph,
                              log: ARAXResponse) -> QGOrganizedKnowledgeGraph:
        """
        Load the first neo4j results table into a QG-organized knowledge graph.

        Columns whose names start with 'nodes' (e.g. 'nodes_n00') are loaded as nodes, filed under what is left
        of the column name once the 'nodes_' prefix is stripped. Columns starting with 'edges' (e.g. 'edges_e01')
        are loaded as edges in the same way. Columns with any other name are ignored.
        :param neo4j_results: Query results; only the first element is read.
        :param kg_name: Name of the queried knowledge graph. The node UUID-to-curie lookup is only built for "KG1".
        :param qg: The query graph being answered (only read for the debug message).
        :param log: Response object that receives log messages.
        :return: The knowledge graph holding every converted node and edge.
        """
        first_qedge_key = next(iter(qg.edges))
        log.debug(f"Processing query results for edge {first_qedge_key}")

        final_kg = QGOrganizedKnowledgeGraph()
        results_table = neo4j_results[0]
        if kg_name == "KG1":
            uuid_to_curie = self._build_node_uuid_to_curie_dict(results_table)
        else:
            uuid_to_curie = dict()

        # Walk a snapshot of the column names so the table itself is never iterated directly
        for column_name in list(results_table):
            if column_name.startswith('nodes'):
                # Node column, e.g. 'nodes_n00' -> qnode key 'n00'
                target_qnode_key = column_name.replace("nodes_", "", 1)
                for raw_node in results_table.get(column_name):
                    trapi_node_key, trapi_node = self._convert_neo4j_node_to_trapi_node(
                        raw_node, kg_name)
                    final_kg.add_node(trapi_node_key, trapi_node,
                                      target_qnode_key)
            elif column_name.startswith('edges'):
                # Edge column, e.g. 'edges_e01' -> qedge key 'e01'
                target_qedge_key = column_name.replace("edges_", "", 1)
                for raw_edge in results_table.get(column_name):
                    trapi_edge_key, trapi_edge = self._convert_neo4j_edge_to_trapi_edge(
                        raw_edge, uuid_to_curie, kg_name)
                    final_kg.add_edge(trapi_edge_key, trapi_edge,
                                      target_qedge_key)

        return final_kg
Пример #2
0
    def _add_answers_to_kg(self, answer_kg: QGOrganizedKnowledgeGraph, reasoner_std_response: Dict[str, any],
                           input_qnode_key: str, output_qnode_key: str, qedge_key: str, log: ARAXResponse) -> QGOrganizedKnowledgeGraph:
        """
        Copy the nodes and edges of a BTE response into answer_kg.

        BTE's own query graph IDs are translated back to ours: "n0" -> input_qnode_key, "n1" -> output_qnode_key
        and "e1" -> qedge_key. Nodes fulfilling the output qnode are re-keyed with the curie picked by
        _get_best_equivalent_bte_curie(), and edges are re-pointed at those new keys.
        :param answer_kg: Knowledge graph to add to (modified in place).
        :param reasoner_std_response: BTE response; its 'results' and 'knowledge_graph' entries are read.
        :param input_qnode_key: Our qnode key for BTE's "n0".
        :param output_qnode_key: Our qnode key for BTE's "n1".
        :param qedge_key: Our qedge key for BTE's "e1".
        :param log: Response object that receives log messages.
        :return: answer_kg. If an unrecognized BTE qg_id is hit, an error is logged and answer_kg is returned
                 with whatever had been added up to that point.
        """
        kg_to_qg_ids_dict = self._build_kg_to_qg_id_dict(reasoner_std_response['results'])
        bte_kg = reasoner_std_response['knowledge_graph']
        bte_edges = bte_kg['edges']
        if not bte_edges:
            # Nothing came back, so there is nothing to load
            return answer_kg

        preferred_key_by_bte_key = dict()
        log.debug(f"Got results back from BTE for this query "
                  f"({len(bte_edges)} edges)")

        for bte_node in bte_kg['nodes']:
            trapi_node = Node()
            bte_node_key = bte_node.get('id')
            trapi_node.name = bte_node.get('name')
            snake_case_type = eu.convert_string_to_snake_case(bte_node.get('type'))
            trapi_node.category = eu.convert_to_list(snake_case_type)

            # Translate BTE's qg_id for this node back into one of our qnode keys
            bte_qg_id = kg_to_qg_ids_dict['nodes'].get(bte_node_key)
            if bte_qg_id == "n0":
                qnode_key = input_qnode_key
            elif bte_qg_id == "n1":
                qnode_key = output_qnode_key
            else:
                log.error("Could not map BTE qg_id to ARAX qnode_key", error_code="UnknownQGID")
                return answer_kg

            # Only output nodes are re-keyed with their preferred equivalent identifier
            if qnode_key != output_qnode_key:
                trapi_node_key = bte_node_key
            elif bte_node_key in preferred_key_by_bte_key:
                trapi_node_key = preferred_key_by_bte_key[bte_node_key]
            else:
                equivalent_curies = []
                for prefix, local_ids in bte_node.get('equivalent_identifiers').items():
                    for local_id in local_ids:
                        equivalent_curies.append(f"{prefix}:{eu.get_curie_local_id(local_id)}")
                trapi_node_key = self._get_best_equivalent_bte_curie(equivalent_curies, trapi_node.category[0])
                preferred_key_by_bte_key[bte_node_key] = trapi_node_key

            answer_kg.add_node(trapi_node_key, trapi_node, qnode_key)

        for bte_edge in bte_edges:
            trapi_edge = Edge()
            trapi_edge_key = bte_edge.get("id")
            trapi_edge.predicate = bte_edge.get('type')
            # Point the edge at the re-keyed node when there is one, otherwise at the original BTE key
            source_id = bte_edge.get('source_id')
            trapi_edge.subject = preferred_key_by_bte_key.get(source_id, source_id)
            target_id = bte_edge.get('target_id')
            trapi_edge.object = preferred_key_by_bte_key.get(target_id, target_id)
            provided_by_attribute = Attribute(name="provided_by", value=bte_edge.get('edge_source'),
                                              type=eu.get_attribute_type("provided_by"))
            defined_by_attribute = Attribute(name="is_defined_by", value="BTE",
                                             type=eu.get_attribute_type("is_defined_by"))
            trapi_edge.attributes = [provided_by_attribute, defined_by_attribute]

            # Translate BTE's qg_id for this edge back into our qedge key ("e1" is the only one expected)
            if kg_to_qg_ids_dict['edges'].get(trapi_edge_key) != "e1":
                log.error("Could not map BTE qg_id to ARAX qedge_key", error_code="UnknownQGID")
                return answer_kg
            answer_kg.add_edge(trapi_edge_key, trapi_edge, qedge_key)

        return answer_kg
Пример #3
0
    def _load_answers_into_kg(
        self, neo4j_results: List[Dict[str, List[Dict[str, any]]]],
        kg_name: str, qg: QueryGraph, log: ARAXResponse
    ) -> Tuple[QGOrganizedKnowledgeGraph, Dict[str, Dict[str, str]]]:
        """
        Load the first neo4j results table into a QG-organized knowledge graph and record, for every edge,
        the value the result row holds under each qnode key.

        Columns whose names start with 'nodes' (e.g. 'nodes_n00') are loaded as nodes and columns starting with
        'edges' (e.g. 'edges_e01') as edges; the rest of the column name (after the 'nodes_'/'edges_' prefix)
        is the query graph key they are filed under. Other columns are ignored.
        :param neo4j_results: Query results; only the first element is read.
        :param kg_name: Name of the queried knowledge graph. The node UUID-to-curie lookup is only built for "KG1".
        :param qg: The query graph being answered; its node keys are looked up on every edge row.
        :param log: Response object that receives log messages.
        :return: A tuple of (1) the populated knowledge graph and (2) a dict keyed by edge key whose values map
                 each qnode key in the query graph to whatever the edge row stores under that key.
        """
        log.debug(
            f"Processing query results for edge {next(iter(qg.edges))}"
        )
        final_kg = QGOrganizedKnowledgeGraph()
        edge_to_nodes_map = dict()
        results_table = neo4j_results[0]
        if kg_name == "KG1":
            uuid_to_curie = self._build_node_uuid_to_curie_dict(results_table)
        else:
            uuid_to_curie = dict()

        # Walk a snapshot of the column names so the table itself is never iterated directly
        for column_name in list(results_table):
            if column_name.startswith('nodes'):
                # Node column, e.g. 'nodes_n00' -> qnode key 'n00'
                target_qnode_key = column_name.replace("nodes_", "", 1)
                for raw_node in results_table.get(column_name):
                    node_key, node = self._convert_neo4j_node_to_swagger_node(
                        raw_node, kg_name)
                    final_kg.add_node(node_key, node, target_qnode_key)
            elif column_name.startswith('edges'):
                # Edge column, e.g. 'edges_e01' -> qedge key 'e01'
                target_qedge_key = column_name.replace("edges_", "", 1)
                for raw_edge in results_table.get(column_name):
                    edge_key, edge = self._convert_neo4j_edge_to_swagger_edge(
                        raw_edge, uuid_to_curie, kg_name)

                    # Note what this edge's row holds for every qnode key in the query graph
                    qnode_assignments = edge_to_nodes_map.setdefault(
                        edge_key, dict())
                    for qnode_key in qg.nodes:
                        qnode_assignments[qnode_key] = raw_edge.get(qnode_key)

                    final_kg.add_edge(edge_key, edge, target_qedge_key)

        return final_kg, edge_to_nodes_map
Пример #4
0
    def answer_single_node_query(
            self, single_node_qg: QueryGraph) -> QGOrganizedKnowledgeGraph:
        """
        Answer a single-node query by looking the node up in neo4j with a cypher query.

        The first node in the query graph is the one answered. When it has an id, that id is first swapped for
        its synonyms (KG1 with use_synonyms on) or its canonical curies (KG2c), and the node's category is cleared.
        :param single_node_qg: Query graph whose first node is looked up.
        :return: A QG-organized knowledge graph holding every node the cypher query returned.
        """
        kg_name = self.kg_name
        use_synonyms = self.use_synonyms
        log = self.response
        final_kg = QGOrganizedKnowledgeGraph()
        single_node_qg = eu.make_qg_use_old_types(
            single_node_qg)  # Temporary patch until we switch to KG2.5.1
        qnode_key = next(qnode_key for qnode_key in single_node_qg.nodes)
        qnode = single_node_qg.nodes[qnode_key]

        # Convert qnode curies as needed (either to synonyms or to canonical versions)
        if qnode.id:
            if use_synonyms and kg_name == "KG1":
                qnode.id = eu.get_curie_synonyms(qnode.id, log)
                qnode.category = [
                ]  # Important to clear this, otherwise results are limited (#889)
            elif kg_name == "KG2c":
                qnode.id = eu.get_canonical_curies_list(qnode.id, log)
                qnode.category = [
                ]  # Important to clear this to avoid discrepancies in types for particular concepts

        # Build and run a cypher query to get this node/nodes
        if isinstance(qnode.id, str):
            # The curie is embedded in a single-quoted cypher string literal, so backslashes and single quotes
            # must be escaped; otherwise such a curie would break (or change the meaning of) the query
            escaped_curie = qnode.id.replace("\\", "\\\\").replace("'", "\\'")
            where_clause = f"{qnode_key}.id='{escaped_curie}'"
        else:
            # Non-string ids (presumably lists of curies) are interpolated using their Python repr
            where_clause = f"{qnode_key}.id in {qnode.id}"
        cypher_query = f"MATCH {self._get_cypher_for_query_node(qnode_key, single_node_qg, kg_name)} WHERE {where_clause} RETURN {qnode_key}"
        log.info(
            f"Sending cypher query for node {qnode_key} to {kg_name} neo4j")
        results = self._run_cypher_query(cypher_query, kg_name, log)

        # Load the results into swagger object model and add to our answer knowledge graph
        for result in results:
            neo4j_node = result.get(qnode_key)
            swagger_node_key, swagger_node = self._convert_neo4j_node_to_swagger_node(
                neo4j_node, kg_name)
            final_kg.add_node(swagger_node_key, swagger_node, qnode_key)

        # TODO: remove this patch once we switch to KG2.5.0!
        eu.convert_node_and_edge_types_to_new_format(final_kg)

        return final_kg
Пример #5
0
    def _answer_single_node_query_using_neo4j(self, qnode_key: str,
                                              qg: QueryGraph, kg_name: str,
                                              log: ARAXResponse):
        """
        Look up a single query node in neo4j and return the matching nodes.

        :param qnode_key: Key of the query node (within qg) to answer; also used as the cypher variable name.
        :param qg: Query graph containing that node.
        :param kg_name: Name of the knowledge graph to query.
        :param log: Response object that receives log messages.
        :return: A QGOrganizedKnowledgeGraph holding every node the cypher query returned, filed under qnode_key.
        """
        qnode = qg.nodes[qnode_key]
        answer_kg = QGOrganizedKnowledgeGraph()

        # Build and run a cypher query to get this node/nodes
        if isinstance(qnode.id, str):
            # The curie is embedded in a single-quoted cypher string literal, so backslashes and single quotes
            # must be escaped; otherwise such a curie would break (or change the meaning of) the query
            escaped_curie = qnode.id.replace("\\", "\\\\").replace("'", "\\'")
            where_clause = f"{qnode_key}.id='{escaped_curie}'"
        else:
            # Non-string ids (presumably lists of curies) are interpolated using their Python repr
            where_clause = f"{qnode_key}.id in {qnode.id}"
        cypher_query = f"MATCH {self._get_cypher_for_query_node(qnode_key, qg)} WHERE {where_clause} RETURN {qnode_key}"
        log.info(
            f"Sending cypher query for node {qnode_key} to {kg_name} neo4j")
        results = self._run_cypher_query(cypher_query, kg_name, log)

        # Load the results into API object model and add to our answer knowledge graph
        for result in results:
            neo4j_node = result.get(qnode_key)
            node_key, node = self._convert_neo4j_node_to_trapi_node(
                neo4j_node, kg_name)
            answer_kg.add_node(node_key, node, qnode_key)

        return answer_kg
Пример #6
0
 def _load_plover_answer_into_object_model(self, plover_answer: Dict[str, Dict[str, Union[set, dict]]],
                                           log: ARAXResponse) -> QGOrganizedKnowledgeGraph:
     """
     Convert a Plover answer into a QG-organized knowledge graph of TRAPI objects.

     :param plover_answer: Answer with a "nodes" entry and an "edges" entry; each maps a query graph key to a
         dict of {node/edge key: tuple}, which is converted with the matching _convert_kg2c_plover_* helper.
     :param log: Response object that receives debug messages (counts and load times per query graph key).
     :return: The populated knowledge graph.
     """
     answer_kg = QGOrganizedKnowledgeGraph()
     # Load returned nodes into TRAPI object model
     for qnode_key, nodes in plover_answer["nodes"].items():
         num_nodes = len(nodes)
         log.debug(f"Loading {num_nodes} {qnode_key} nodes into TRAPI object model")
         # perf_counter is monotonic, so the elapsed time can't be skewed by system clock adjustments
         start = time.perf_counter()
         for node_key, node_tuple in nodes.items():
             node = self._convert_kg2c_plover_node_to_trapi_node(node_tuple)
             answer_kg.add_node(node_key, node, qnode_key)
         log.debug(f"Loading {num_nodes} {qnode_key} nodes into TRAPI object model took "
                   f"{round(time.perf_counter() - start, 2)} seconds")
     # Load returned edges into TRAPI object model
     for qedge_key, edges in plover_answer["edges"].items():
         num_edges = len(edges)
         # Name the qedge here too, so this message lines up with the "took" message below
         log.debug(f"Loading {num_edges} {qedge_key} edges into TRAPI object model")
         start = time.perf_counter()
         for edge_key, edge_tuple in edges.items():
             edge = self._convert_kg2c_plover_edge_to_trapi_edge(edge_tuple)
             answer_kg.add_edge(edge_key, edge, qedge_key)
         log.debug(f"Loading {num_edges} {qedge_key} edges into TRAPI object model took "
                   f"{round(time.perf_counter() - start, 2)} seconds")
     return answer_kg
Пример #7
0
    def answer_one_hop_query(
        self, query_graph: QueryGraph
    ) -> Tuple[QGOrganizedKnowledgeGraph, Dict[str, Dict[str, str]]]:
        """
        Answer a one-hop (single-edge) query with NGD edges, using KG2 to find the candidate node pairs.

        The query is first sent to ARAX/KG2; NGD is then calculated for the two nodes of every edge KG2 returned,
        and an NGD edge (plus its two nodes) is created for each pair whose NGD value is below a fixed threshold.
        :param query_graph: A Reasoner API standard query graph.
        :return: A tuple containing:
            1. an (almost) Reasoner API standard knowledge graph containing all of the nodes and edges returned as
           results for the query. (Dictionary version, organized by QG IDs.)
            2. a map of which nodes fulfilled which qnode_keys for each edge. Example:
              {'KG1:111221': {'n00': 'DOID:111', 'n01': 'HP:124'}, 'KG1:111223': {'n00': 'DOID:111', 'n01': 'HP:126'}}
           Both are returned empty if the query graph fails validation or the KG2 query leaves a non-OK status.
        """
        log = self.response
        final_kg = QGOrganizedKnowledgeGraph()
        edge_to_nodes_map = dict()

        def _join_dsl_params(*params):
            # Comma-separate the DSL parameters, leaving out any that are empty/None
            return ", ".join(param for param in params if param)

        # Bail out early unless this is a valid one-hop query graph
        self._verify_one_hop_query_graph_is_valid(query_graph, log)
        if log.status != 'OK':
            return final_kg, edge_to_nodes_map

        # Ask KG2 for the potential answers
        log.debug("Finding potential answers using KG2")
        qedge_key = next(iter(query_graph.edges))
        qedge = query_graph.edges[qedge_key]
        source_qnode_key, target_qnode_key = qedge.subject, qedge.object
        source_qnode = query_graph.nodes[source_qnode_key]
        target_qnode = query_graph.nodes[target_qnode_key]
        qedge_params_str = _join_dsl_params(
            f"key={qedge_key}",
            f"subject={source_qnode_key}",
            f"object={target_qnode_key}",
            self._get_dsl_qedge_type_str(qedge))
        source_params_str = _join_dsl_params(
            f"key={source_qnode_key}",
            self._get_dsl_qnode_curie_str(source_qnode),
            self._get_dsl_qnode_category_str(source_qnode))
        target_params_str = _join_dsl_params(
            f"key={target_qnode_key}",
            self._get_dsl_qnode_curie_str(target_qnode),
            self._get_dsl_qnode_category_str(target_qnode))
        actions_list = [
            f"add_qnode({source_params_str})",
            f"add_qnode({target_params_str})",
            f"add_qedge({qedge_params_str})",
            "expand(kp=ARAX/KG2)",
            "return(message=true, store=false)",
        ]
        kg2_response, kg2_message = self._run_arax_query(actions_list, log)
        if log.status != 'OK':
            return final_kg, edge_to_nodes_map

        # Calculate NGD for the node pair of every edge KG2 returned
        log.debug("Calculating NGD between each potential node pair")
        kg2_answer_kg = kg2_message.knowledge_graph
        cngd = ComputeNGD(log, kg2_message, None)
        cngd.load_curie_to_pmids_data(kg2_answer_kg.nodes)
        ngd_candidates = []  # One (ngd_value, subject_key, object_key) entry per KG2 edge, in KG2's edge order
        for kg2_edge in kg2_answer_kg.edges.values():
            first_key = kg2_edge.subject
            second_key = kg2_edge.object
            # These are already canonicalized (default behavior)
            first_node = kg2_answer_kg.nodes.get(first_key)
            second_node = kg2_answer_kg.nodes.get(second_key)
            # The KG2 query was bidirectional, so the edge's subject isn't necessarily the source qnode's node
            if source_qnode_key in first_node.qnode_keys and target_qnode_key in second_node.qnode_keys:
                pair_subject, pair_object = first_key, second_key
            else:
                pair_subject, pair_object = second_key, first_key
            pair_ngd = cngd.calculate_ngd_fast(pair_subject, pair_object)
            ngd_candidates.append((pair_ngd, pair_subject, pair_object))

        # Turn every pair with a low enough NGD value into an NGD edge
        threshold = 0.5
        log.debug(
            f"Creating edges between node pairs with NGD below the threshold ({threshold})"
        )
        for ngd_value, subject_key, object_key in ngd_candidates:
            # TODO: Make determination of the threshold much more sophisticated
            if ngd_value is None or not ngd_value < threshold:
                continue
            ngd_edge_key, ngd_edge = self._create_ngd_edge(
                ngd_value, subject_key, object_key)
            ngd_source_node_key, ngd_source_node = self._create_ngd_node(
                ngd_edge.subject, kg2_answer_kg.nodes.get(ngd_edge.subject))
            ngd_target_node_key, ngd_target_node = self._create_ngd_node(
                ngd_edge.object, kg2_answer_kg.nodes.get(ngd_edge.object))
            final_kg.add_edge(ngd_edge_key, ngd_edge, qedge_key)
            final_kg.add_node(ngd_source_node_key, ngd_source_node,
                              source_qnode_key)
            final_kg.add_node(ngd_target_node_key, ngd_target_node,
                              target_qnode_key)
            edge_to_nodes_map[ngd_edge_key] = {
                source_qnode_key: ngd_source_node_key,
                target_qnode_key: ngd_target_node_key
            }

        return final_kg, edge_to_nodes_map
Пример #8
0
    def _answer_query_using_CHP_client(
            self, query_graph: QueryGraph,
            log: ARAXResponse) -> QGOrganizedKnowledgeGraph:
        qedge_key = next(qedge_key for qedge_key in query_graph.edges)
        log.debug(
            f"Processing query results for edge {qedge_key} by using CHP client"
        )
        final_kg = QGOrganizedKnowledgeGraph()
        gene_label_list = ['gene']
        drug_label_list = ['drug', 'chemicalsubstance']
        # use for checking the requirement
        source_pass_nodes = None
        source_category = None
        target_pass_nodes = None
        target_category = None

        qedge = query_graph.edges[qedge_key]
        source_qnode_key = qedge.subject
        target_qnode_key = qedge.object
        source_qnode = query_graph.nodes[source_qnode_key]
        target_qnode = query_graph.nodes[target_qnode_key]

        # check if both ends of edge have no curie
        if (source_qnode.id is None) and (target_qnode.id is None):
            log.error(f"Both ends of edge {qedge_key} are None",
                      error_code="BadEdge")
            return final_kg

        # check if the query nodes are drug or disease
        if source_qnode.id is not None:

            if type(source_qnode.id) is str:
                source_pass_nodes = [source_qnode.id]
            else:
                source_pass_nodes = source_qnode.id
            has_error, pass_nodes, not_pass_nodes = self._check_id(
                source_qnode.id, log)
            if has_error:
                return final_kg
            else:
                if len(not_pass_nodes) == 0 and len(pass_nodes) != 0:
                    source_pass_nodes = pass_nodes
                elif len(not_pass_nodes) != 0 and len(pass_nodes) != 0:
                    source_pass_nodes = pass_nodes
                    if len(not_pass_nodes) == 1:
                        log.warning(
                            f"The curie id of {not_pass_nodes[0]} is not allowable based on CHP client"
                        )
                    else:
                        log.warning(
                            f"The curie ids of these nodes {not_pass_nodes} are not allowable based on CHP client"
                        )
                else:
                    if type(source_qnode.id) is str:
                        log.error(
                            f"The curie id of {source_qnode.id} is not allowable based on CHP client",
                            error_code="NotAllowable")
                        return final_kg
                    else:
                        log.error(
                            f"The curie ids of {source_qnode.id} are not allowable based on CHP client",
                            error_code="NotAllowable")
                        return final_kg
        else:
            category = source_qnode.category[0].replace(
                'biolink:', '').replace('_', '').lower()
            source_category = category
            if (category in drug_label_list) or (category in gene_label_list):
                source_category = category
            else:
                log.error(
                    f"The category of query node {source_qnode_key} is unsatisfiable. It has to be drug/chemical_substance or gene",
                    error_code="CategoryError")
                return final_kg

        if target_qnode.id is not None:

            if type(target_qnode.id) is str:
                target_pass_nodes = [target_qnode.id]
            else:
                target_pass_nodes = target_qnode.id
            has_error, pass_nodes, not_pass_nodes = self._check_id(
                target_qnode.id, log)
            if has_error:
                return final_kg
            else:
                if len(not_pass_nodes) == 0 and len(pass_nodes) != 0:
                    target_pass_nodes = pass_nodes
                elif len(not_pass_nodes) != 0 and len(pass_nodes) != 0:
                    target_pass_nodes = pass_nodes
                    if len(not_pass_nodes) == 1:
                        log.warning(
                            f"The curie id of {not_pass_nodes[0]} is not allowable based on CHP client"
                        )
                    else:
                        log.warning(
                            f"The curie ids of these nodes {not_pass_nodes} are not allowable based on CHP client"
                        )
                else:
                    if type(target_qnode.id) is str:
                        log.error(
                            f"The curie id of {target_qnode.id} is not allowable based on CHP client",
                            error_code="CategoryError")
                        return final_kg
                    else:
                        log.error(
                            f"The curie ids of {target_qnode.id} are not allowable based on CHP client",
                            error_code="CategoryError")
                        return final_kg
        else:
            category = target_qnode.category[0].replace(
                'biolink:', '').replace('_', '').lower()
            target_category = category
            if (category in drug_label_list) or (category in gene_label_list):
                target_category = category
            else:
                log.error(
                    f"The category of query node {target_qnode_key} is unsatisfiable. It has to be drug/chemical_substance or gene",
                    error_code="CategoryError")
                return final_kg

        if (source_pass_nodes is None) and (target_pass_nodes is None):
            return final_kg

        elif (source_pass_nodes is not None) and (target_pass_nodes
                                                  is not None):
            source_dict = dict()
            target_dict = dict()
            if source_pass_nodes[0] in self.allowable_drug_curies:
                source_category_temp = 'drug'
            else:
                source_category_temp = 'gene'
            if target_pass_nodes[0] in self.allowable_drug_curies:
                target_category_temp = 'drug'
            else:
                target_category_temp = 'gene'
            if source_category_temp == target_category_temp:
                log.error(
                    f"The query nodes in both ends of edge are the same type which is {source_category_temp}",
                    error_code="CategoryError")
                return final_kg
            else:
                for (source_curie, target_curie) in itertools.product(
                        source_pass_nodes, target_pass_nodes):

                    if source_category_temp == 'drug':
                        source_curie_temp = source_curie.replace(
                            'CHEMBL.COMPOUND:', 'CHEMBL:')
                        # Let's build a simple single query
                        q = build_query(genes=[target_curie],
                                        therapeutic=source_curie_temp,
                                        disease='MONDO:0007254',
                                        outcome=('EFO:0000714', '>=',
                                                 self.CHP_survival_threshold))

                        response = self.client.query(q)
                        max_probability = self.client.get_outcome_prob(
                            response)
                        swagger_edge_key, swagger_edge = self._convert_to_swagger_edge(
                            target_curie, source_curie, "paired_with",
                            max_probability)
                    else:
                        target_curie_temp = target_curie.replace(
                            'CHEMBL.COMPOUND:', 'CHEMBL:')
                        # Let's build a simple single query
                        q = build_query(genes=[source_curie],
                                        therapeutic=target_curie_temp,
                                        disease='MONDO:0007254',
                                        outcome=('EFO:0000714', '>=',
                                                 self.CHP_survival_threshold))

                        response = self.client.query(q)
                        max_probability = self.client.get_outcome_prob(
                            response)
                        swagger_edge_key, swagger_edge = self._convert_to_swagger_edge(
                            source_curie, target_curie, "paired_with",
                            max_probability)

                    source_dict[source_curie] = source_qnode_key
                    target_dict[target_curie] = target_qnode_key

                    # Finally add the current edge to our answer knowledge graph
                    final_kg.add_edge(swagger_edge_key, swagger_edge,
                                      qedge_key)

                # Add the nodes to our answer knowledge graph
                if len(source_dict) != 0:
                    for source_curie in source_dict:
                        swagger_node_key, swagger_node = self._convert_to_swagger_node(
                            source_curie)
                        final_kg.add_node(swagger_node_key, swagger_node,
                                          source_dict[source_curie])
                if len(target_dict) != 0:
                    for target_curie in target_dict:
                        swagger_node_key, swagger_node = self._convert_to_swagger_node(
                            target_curie)
                        final_kg.add_node(swagger_node_key, swagger_node,
                                          target_dict[target_curie])

                return final_kg

        elif source_pass_nodes is not None:
            source_dict = dict()
            target_dict = dict()

            if source_pass_nodes[0] in self.allowable_drug_curies:
                source_category_temp = 'drug'
            else:
                source_category_temp = 'gene'
            if target_category in drug_label_list:
                target_category_temp = 'drug'
            else:
                target_category_temp = 'gene'
            if source_category_temp == target_category_temp:
                log.error(
                    f"The query nodes in both ends of edge are the same type which is {source_category_temp}",
                    error_code="CategoryError")
                return final_kg
            else:
                if source_category_temp == 'drug':
                    for source_curie in source_pass_nodes:

                        genes = [
                            curie for curie in self.allowable_gene_curies
                            if self.synonymizer.get_canonical_curies(curie)
                            [curie] is not None and target_category in [
                                category.replace('biolink:', '').replace(
                                    '_', '').lower() for category in list(
                                        self.synonymizer.get_canonical_curies(
                                            curie, return_all_categories=True)
                                        [curie]['all_categories'].keys())
                            ]
                        ]
                        therapeutic = source_curie.replace(
                            'CHEMBL.COMPOUND:', 'CHEMBL:')
                        disease = 'MONDO:0007254'
                        outcome = ('EFO:0000714', '>=',
                                   self.CHP_survival_threshold)

                        queries = []
                        for gene in genes:
                            queries.append(
                                build_query(
                                    genes=[gene],
                                    therapeutic=therapeutic,
                                    disease=disease,
                                    outcome=outcome,
                                ))

                        # use the query_all endpoint to run the batch of queries
                        res = self.client.query_all(queries)

                        for result, gene in zip(res["message"], genes):
                            prob = self.client.get_outcome_prob(result)
                            swagger_edge_key, swagger_edge = self._convert_to_swagger_edge(
                                gene, source_curie, "paired_with", prob)

                            source_dict[source_curie] = source_qnode_key
                            target_dict[gene] = target_qnode_key

                            # Finally add the current edge to our answer knowledge graph
                            final_kg.add_edge(swagger_edge_key, swagger_edge,
                                              qedge_key)
                else:
                    for source_curie in source_pass_nodes:

                        genes = [source_curie]
                        therapeutic = [
                            curie.replace('CHEMBL.COMPOUND:', 'CHEMBL:')
                            for curie in self.allowable_drug_curies
                            if self.synonymizer.get_canonical_curies(
                                curie.replace('CHEMBL:', 'CHEMBL.COMPOUND:'))
                            [curie.replace('CHEMBL:', 'CHEMBL.COMPOUND:')]
                            is not None and target_category in [
                                category.replace('biolink:', '').replace(
                                    '_', '').lower()
                                for category in list(
                                    self.synonymizer.get_canonical_curies(
                                        curie.replace('CHEMBL:',
                                                      'CHEMBL.COMPOUND:'),
                                        return_all_categories=True)[
                                            curie.replace(
                                                'CHEMBL:', 'CHEMBL.COMPOUND:')]
                                    ['all_categories'].keys())
                            ]
                        ]
                        disease = 'MONDO:0007254'
                        outcome = ('EFO:0000714', '>=',
                                   self.CHP_survival_threshold)

                        queries = []
                        for drug in therapeutic:
                            queries.append(
                                build_query(
                                    genes=genes,
                                    therapeutic=drug,
                                    disease=disease,
                                    outcome=outcome,
                                ))

                        # use the query_all endpoint to run the batch of queries
                        res = self.client.query_all(queries)

                        for result, drug in zip(res["message"], therapeutic):
                            drug = drug.replace('CHEMBL:', 'CHEMBL.COMPOUND:')
                            prob = self.client.get_outcome_prob(result)
                            swagger_edge_key, swagger_edge = self._convert_to_swagger_edge(
                                source_curie, drug, "paired_with", prob)

                            source_dict[source_curie] = source_qnode_key
                            target_dict[drug] = target_qnode_key

                            # Finally add the current edge to our answer knowledge graph
                            final_kg.add_edge(swagger_edge_key, swagger_edge,
                                              qedge_key)

                # Add the nodes to our answer knowledge graph
                if len(source_dict) != 0:
                    for source_curie in source_dict:
                        swagger_node_key, swagger_node = self._convert_to_swagger_node(
                            source_curie)
                        final_kg.add_node(swagger_node_key, swagger_node,
                                          source_dict[source_curie])
                if len(target_dict) != 0:
                    for target_curie in target_dict:
                        swagger_node_key, swagger_node = self._convert_to_swagger_node(
                            target_curie)
                        final_kg.add_node(swagger_node_key, swagger_node,
                                          target_dict[target_curie])

                return final_kg
        else:
            source_dict = dict()
            target_dict = dict()

            if target_pass_nodes[0] in self.allowable_drug_curies:
                target_category_temp = 'drug'
            else:
                target_category_temp = 'gene'
            if source_category in drug_label_list:
                source_category_temp = 'drug'
            else:
                source_category_temp = 'gene'
            if source_category_temp == target_category_temp:
                log.error(
                    f"The query nodes in both ends of edge are the same type which is {source_category_temp}",
                    error_code="CategoryError")
                return final_kg
            else:
                if target_category_temp == 'drug':
                    for target_curie in target_pass_nodes:

                        genes = [
                            curie for curie in self.allowable_gene_curies
                            if self.synonymizer.get_canonical_curies(curie)
                            [curie] is not None and source_category in [
                                category.replace('biolink:', '').replace(
                                    '_', '').lower() for category in list(
                                        self.synonymizer.get_canonical_curies(
                                            curie, return_all_categories=True)
                                        [curie]['all_categories'].keys())
                            ]
                        ]
                        therapeutic = target_curie.replace(
                            'CHEMBL.COMPOUND:', 'CHEMBL:')
                        disease = 'MONDO:0007254'
                        outcome = ('EFO:0000714', '>=',
                                   self.CHP_survival_threshold)

                        queries = []
                        for gene in genes:
                            queries.append(
                                build_query(
                                    genes=[gene],
                                    therapeutic=therapeutic,
                                    disease=disease,
                                    outcome=outcome,
                                ))

                        # use the query_all endpoint to run the batch of queries
                        res = self.client.query_all(queries)

                        for result, gene in zip(res["message"], genes):
                            prob = self.client.get_outcome_prob(result)
                            swagger_edge_key, swagger_edge = self._convert_to_swagger_edge(
                                gene, target_curie, "paired_with", prob)

                            source_dict[gene] = source_qnode_key
                            target_dict[target_curie] = target_qnode_key

                            # Finally add the current edge to our answer knowledge graph
                            final_kg.add_edge(swagger_edge_key, swagger_edge,
                                              qedge_key)

                else:
                    for target_curie in target_pass_nodes:

                        genes = [target_curie]
                        therapeutic = [
                            curie.replace('CHEMBL.COMPOUND:', 'CHEMBL:')
                            for curie in self.allowable_drug_curies
                            if self.synonymizer.get_canonical_curies(
                                curie.replace('CHEMBL:', 'CHEMBL.COMPOUND:'))
                            [curie.replace('CHEMBL:', 'CHEMBL.COMPOUND:')]
                            is not None and source_category in [
                                category.replace('biolink:', '').replace(
                                    '_', '').lower()
                                for category in list(
                                    self.synonymizer.get_canonical_curies(
                                        curie.replace('CHEMBL:',
                                                      'CHEMBL.COMPOUND:'),
                                        return_all_categories=True)[
                                            curie.replace(
                                                'CHEMBL:', 'CHEMBL.COMPOUND:')]
                                    ['all_categories'].keys())
                            ]
                        ]
                        disease = 'MONDO:0007254'
                        outcome = ('EFO:0000714', '>=',
                                   self.CHP_survival_threshold)

                        queries = []
                        for drug in therapeutic:
                            queries.append(
                                build_query(
                                    genes=genes,
                                    therapeutic=drug,
                                    disease=disease,
                                    outcome=outcome,
                                ))

                        # use the query_all endpoint to run the batch of queries
                        res = self.client.query_all(queries)

                        for result, drug in zip(res["message"], therapeutic):
                            drug = drug.replace('CHEMBL:', 'CHEMBL.COMPOUND:')
                            prob = self.client.get_outcome_prob(result)
                            swagger_edge_key, swagger_edge = self._convert_to_swagger_edge(
                                target_curie, drug, "paired_with", prob)

                            source_dict[drug] = source_qnode_key
                            target_dict[target_curie] = target_qnode_key

                            # Finally add the current edge to our answer knowledge graph
                            final_kg.add_edge(swagger_edge_key, swagger_edge,
                                              qedge_key)

                # Add the nodes to our answer knowledge graph
                if len(source_dict) != 0:
                    for source_curie in source_dict:
                        swagger_node_key, swagger_node = self._convert_to_swagger_node(
                            source_curie)
                        final_kg.add_node(swagger_node_key, swagger_node,
                                          source_dict[source_curie])
                if len(target_dict) != 0:
                    for target_curie in target_dict:
                        swagger_node_key, swagger_node = self._convert_to_swagger_node(
                            target_curie)
                        final_kg.add_node(swagger_node_key, swagger_node,
                                          target_dict[target_curie])

                return final_kg
Пример #9
0
    def answer_one_hop_query(
            self, query_graph: QueryGraph) -> QGOrganizedKnowledgeGraph:
        """
        Answer a one-hop (single-edge) query with NGD edges, using KG2 to propose the candidate node pairs.
        :param query_graph: A TRAPI query graph.
        :return: An (almost) TRAPI knowledge graph, organized by QG IDs, holding one NGD edge (plus its
                two nodes) for every KG2 answer edge whose node pair scores below the NGD threshold.
        """
        log = self.response
        final_kg = QGOrganizedKnowledgeGraph()

        # Nothing to do unless this is a valid one-hop query graph
        self._verify_one_hop_query_graph_is_valid(query_graph, log)
        if log.status != 'OK':
            return final_kg
        qedge_key = next(iter(query_graph.edges))
        qedge = query_graph.edges[qedge_key]
        # A qedge with no predicates is accepted; otherwise at least one must be a predicate NGD supports
        if qedge.predicates and set(qedge.predicates).isdisjoint(
                self.accepted_qedge_predicates):
            log.error(
                f"NGD can only expand qedges with these predicates: {self.accepted_qedge_predicates}. QEdge"
                f" {qedge_key}'s predicate is: {qedge.predicates}",
                error_code="UnsupportedQG")
            return final_kg
        source_qnode_key, target_qnode_key = qedge.subject, qedge.object

        # Ask KG2 for candidate answers, using a copy of the query graph with all predicates cleared
        log.debug(f"Finding potential answers using KG2")
        predicate_free_qg = copy.deepcopy(query_graph)
        for copied_qedge in predicate_free_qg.edges.values():
            copied_qedge.predicates = None
        _, kg2_message = self._run_arax_query(
            {"message": {"query_graph": predicate_free_qg.to_dict()}}, log)
        if log.status != 'OK':
            return final_kg

        # Score the node pair behind every KG2 answer edge
        log.debug(f"Calculating NGD between each potential node pair")
        kg2_kg = kg2_message.knowledge_graph
        ngd_computer = ComputeNGD(log, kg2_message, None)
        ngd_computer.load_curie_to_pmids_data(kg2_kg.nodes)
        scored_pairs = {}  # KG2 edge key -> (ngd value, subject key, object key, PMID curies)
        for kg2_edge_key, kg2_edge in kg2_kg.edges.items():
            first_key, second_key = kg2_edge.subject, kg2_edge.object
            # These are already canonicalized (default behavior)
            first_node = kg2_kg.nodes.get(first_key)
            second_node = kg2_kg.nodes.get(second_key)
            # The KG2 query was bidirectional, so the edge's subject doesn't necessarily fulfill the source qnode
            is_aligned = (source_qnode_key in first_node.qnode_keys
                          and target_qnode_key in second_node.qnode_keys)
            if is_aligned:
                pair_subject, pair_object = first_key, second_key
            else:
                pair_subject, pair_object = second_key, first_key
            ngd_value, pmid_set = ngd_computer.calculate_ngd_fast(
                pair_subject, pair_object)
            scored_pairs[kg2_edge_key] = (
                ngd_value, pair_subject, pair_object,
                [f"PMID:{pmid}" for pmid in pmid_set])

        # Keep only the pairs whose NGD value is low enough, and build an edge + two nodes for each
        threshold = 0.5
        log.debug(
            f"Creating edges between node pairs with NGD below the threshold ({threshold})"
        )
        for ngd_value, pair_subject, pair_object, pmid_list in scored_pairs.values():
            # TODO: Make determination of the threshold much more sophisticated
            # (written as "not <" rather than ">=" so a NaN score is skipped too)
            if ngd_value is None or not (ngd_value < threshold):
                continue
            ngd_edge_key, ngd_edge = self._create_ngd_edge(
                ngd_value, pair_subject, pair_object, pmid_list)
            source_node_key, source_node = self._create_ngd_node(
                ngd_edge.subject, kg2_kg.nodes.get(ngd_edge.subject))
            target_node_key, target_node = self._create_ngd_node(
                ngd_edge.object, kg2_kg.nodes.get(ngd_edge.object))
            final_kg.add_edge(ngd_edge_key, ngd_edge, qedge_key)
            final_kg.add_node(source_node_key, source_node, source_qnode_key)
            final_kg.add_node(target_node_key, target_node, target_qnode_key)

        return final_kg
Пример #10
0
    def _grab_nodes_and_edges_from_sqlite(
            self, plover_answer: Dict[str, Dict[str, Set[Union[str, int]]]],
            kg_name: str, log: ARAXResponse) -> QGOrganizedKnowledgeGraph:
        """
        Load the node and edge objects for a Plover answer out of the local KG2c sqlite database.
        :param plover_answer: Under "nodes", maps each qnode key to the node IDs to look up; under "edges",
                maps each qedge key to the edge IDs to look up.
        :param kg_name: Handed through to the node/edge conversion helpers.
        :param log: Receives debug messages about each lookup and how long loading took.
        :return: A knowledge graph (organized by QG IDs) holding every node/edge row found in sqlite.
        """
        # Work out where the sqlite file lives (look up its path using database manager-friendly method)
        path_list = os.path.realpath(__file__).split(os.path.sep)
        rtx_index = path_list.index("RTX")
        rtxc = RTXConfiguration()
        sqlite_dir_path = os.path.sep.join([
            *path_list[:(rtx_index + 1)], 'code', 'ARAX', 'KnowledgeSources',
            'KG2c'
        ])
        sqlite_name = rtxc.kg2c_sqlite_path.split('/')[-1]
        sqlite_file_path = f"{sqlite_dir_path}{os.path.sep}{sqlite_name}"
        answer_kg = QGOrganizedKnowledgeGraph()

        connection = sqlite3.connect(sqlite_file_path)
        # try/finally so the connection is released even when a query or a row conversion raises
        try:
            cursor = connection.cursor()

            # Grab the node objects from sqlite corresponding to the returned node IDs
            num_nodes = sum(
                len(nodes) for nodes in plover_answer["nodes"].values())
            start = time.time()
            for qnode_key, node_keys in plover_answer["nodes"].items():
                # SQL wants ('node1', 'node2') format for string lists. Single quotes inside an ID are
                # doubled so the ID stays one string literal instead of terminating it early. (IDs are
                # inlined rather than bound as parameters because an answer can hold more IDs than
                # sqlite allows bound variables in one statement.)
                node_keys_str = "','".join(
                    node_key.replace("'", "''") for node_key in node_keys)
                sql_query = f"SELECT N.node " \
                            f"FROM nodes AS N " \
                            f"WHERE N.id IN ('{node_keys_str}')"
                log.debug(
                    f"Looking up {len(node_keys)} returned {qnode_key} node IDs in KG2c sqlite"
                )
                cursor.execute(sql_query)
                for row in cursor.fetchall():
                    # Each row holds a single column containing the node serialized as JSON
                    node_as_dict = ujson.loads(row[0])
                    node_key, node = self._convert_neo4j_node_to_trapi_node(
                        node_as_dict, kg_name)
                    answer_kg.add_node(node_key, node, qnode_key)
            log.debug(
                f"Grabbing {num_nodes} nodes from sqlite and loading into object model took "
                f"{round(time.time() - start, 2)} seconds")

            # Grab the edge objects from sqlite corresponding to the returned edge IDs
            num_edges = sum(
                len(edges) for edges in plover_answer["edges"].values())
            start = time.time()
            for qedge_key, edge_keys in plover_answer["edges"].items():
                # SQL wants (1, 2) format int lists
                edge_keys_str = ",".join(
                    str(edge_key) for edge_key in edge_keys)
                sql_query = f"SELECT E.edge " \
                            f"FROM edges AS E " \
                            f"WHERE E.id IN ({edge_keys_str})"
                log.debug(
                    f"Looking up {len(edge_keys)} returned {qedge_key} edge IDs in KG2c sqlite"
                )
                cursor.execute(sql_query)
                for row in cursor.fetchall():
                    edge_as_dict = ujson.loads(row[0])
                    # An empty uuid-to-curie map is passed, as in every sqlite lookup here
                    edge_key, edge = self._convert_neo4j_edge_to_trapi_edge(
                        edge_as_dict, dict(), kg_name)
                    answer_kg.add_edge(edge_key, edge, qedge_key)
            log.debug(
                f"Grabbing {num_edges} edges from sqlite and loading into object model took "
                f"{round(time.time() - start, 2)} seconds")

            cursor.close()
        finally:
            connection.close()
        return answer_kg
Пример #11
0
    def answer_one_hop_query(
        self, query_graph: QueryGraph
    ) -> Tuple[QGOrganizedKnowledgeGraph, Dict[str, Dict[str, str]]]:
        """
        Answer a one-hop (single-edge) query by sending it to the Genetics Provider.
        :param query_graph: A Reasoner API standard query graph.
        :return: A tuple containing:
            1. an (almost) Reasoner API standard knowledge graph with the nodes and edges kept from the KP's
               answer. (Organized by QG IDs.)
            2. a map of which nodes fulfilled which qnode_keys for each kept edge. Example:
              {'KG1:111221': {'n00': 'DOID:111', 'n01': 'HP:124'}, 'KG1:111223': {'n00': 'DOID:111', 'n01': 'HP:126'}}
        """
        log = self.response
        include_all_scores = self.response.data['parameters'][
            'include_all_scores']
        final_kg = QGOrganizedKnowledgeGraph()
        edge_to_nodes_map = {}
        # Temporary patch until TRAPI 1.0 KP endpoint is ready
        query_graph = eu.make_qg_use_old_types(query_graph)

        # Validate the one-hop query graph, then adapt it for this KP; stop as soon as either step fails
        self._verify_one_hop_query_graph_is_valid(query_graph, log)
        if log.status != 'OK':
            return final_kg, edge_to_nodes_map
        modified_query_graph = self._pre_process_query_graph(query_graph, log)
        if log.status != 'OK':
            return final_kg, edge_to_nodes_map
        qedge = next(iter(modified_query_graph.edges.values()))
        source_qnode_key, target_qnode_key = qedge.subject, qedge.object

        # Send the query to the KP; without a KG in the response there is nothing to load
        json_response = self._send_query_to_kp(modified_query_graph, log)
        returned_kg = json_response.get('knowledge_graph')
        if not returned_kg:
            log.warning(
                f"No KG is present in the response from {self.kp_name}")
            return final_kg, edge_to_nodes_map

        # Map of node/edge IDs to the qnode/qedge IDs they fulfill
        qg_id_mappings = self._get_qg_id_mappings_from_results(
            json_response['results'])

        # Load the edges into our Swagger model
        unknown_scores_encountered = set()
        for returned_edge in returned_kg['edges']:
            # Edges missing a source and/or target ID have been encountered before; leave them out
            if not (returned_edge['source_id'] and returned_edge['target_id']):
                log.warning(
                    f"Edge returned from GeneticsKP is lacking a subject and/or object: {returned_edge}."
                    f" Will skip adding this edge to the KG.")
                continue
            score_name = returned_edge['score_name']
            if score_name not in self.score_type_lookup:
                unknown_scores_encountered.add(score_name)
            # An edge is kept when include_all_scores is set, or when its score is the magma score.
            # NOTE(review): the earlier comment here read "always include edges for integrated scores, but
            # only include magma edges if that flag is set", which looks like the reverse - confirm intent.
            if not (include_all_scores or score_name == self.magma_score_name):
                continue
            kp_edge_key, swagger_edge = self._create_swagger_edge_from_kp_edge(
                returned_edge)
            # Convert to an ID that's unique for us
            swagger_edge_key = self._create_unique_edge_key(swagger_edge)
            for qedge_key in qg_id_mappings['edges'][kp_edge_key]:
                final_kg.add_edge(swagger_edge_key, swagger_edge, qedge_key)
            edge_to_nodes_map[swagger_edge_key] = {
                source_qnode_key: swagger_edge.subject,
                target_qnode_key: swagger_edge.object,
            }
        if unknown_scores_encountered:
            log.warning(
                f"Encountered unknown score(s) from {self.kp_name}: {unknown_scores_encountered}. "
                f"Not sure what data type to assign these.")

        # Load the nodes into our Swagger model
        for returned_node in returned_kg['nodes']:
            # Skip any nodes with 'None' for their ID (see discussion in #1154)
            if not returned_node['id']:
                log.warning(
                    f"Node returned from {self.kp_name} is lacking an ID: {returned_node}."
                    f" Will skip adding this node to the KG.")
                continue
            swagger_node_key, swagger_node = self._create_swagger_node_from_kp_node(
                returned_node)
            for qnode_key in qg_id_mappings['nodes'][swagger_node_key]:
                final_kg.add_node(swagger_node_key, swagger_node, qnode_key)

        return final_kg, edge_to_nodes_map