Exemplo n.º 1
0
def get_preferred_curies(curie: Union[str, List[str]],
                         log: Response) -> Dict[str, Dict[str, str]]:
    """Ask the NodeSynonymizer for canonical info about one or more curies.

    The input is first normalized to a list and then passed to
    ``NodeSynonymizer.get_canonical_curies()``. An exception raised while
    using the synonymizer is not propagated: it is logged as an error whose
    error code is the exception's class name.

    :param curie: a single curie or a list of curies.
    :param log: response object that receives debug/warning/error messages.
    :return: the dictionary returned by the NodeSynonymizer, or an empty dict
        when the lookup raised an exception or returned None.
    """
    curie_list = convert_string_or_list_to_list(curie)
    try:
        synonymizer = NodeSynonymizer()
        log.debug(
            f"Sending NodeSynonymizer.get_canonical_curies() a list of {len(curie_list)} curies"
        )
        canonical_info = synonymizer.get_canonical_curies(curie_list)
        log.debug("Got response back from NodeSynonymizer")
    except Exception as problem:
        log.error(
            f"Encountered a problem using NodeSynonymizer: {traceback.format_exc()}",
            error_code=type(problem).__name__)
        return {}

    if canonical_info is None:
        log.error("NodeSynonymizer returned None",
                  error_code="NodeNormalizationIssue")
        return {}

    # Keys present in the answer but carrying an empty value were not recognized
    unrecognized = {key for key, info in canonical_info.items() if not info}
    if unrecognized:
        log.warning(
            f"NodeSynonymizer did not return canonical info for: {unrecognized}"
        )
    return canonical_info
Exemplo n.º 2
0
    def _remove_self_edges(kg: DictKnowledgeGraph,
                           edge_to_nodes_map: Dict[str, Dict[str, str]],
                           qedge_id: str, qnodes: List[QNode],
                           log: Response) -> DictKnowledgeGraph:
        """Remove self-edges for one query edge, plus any nodes this orphans.

        An edge counts as a self-edge when its ``source_id`` equals its
        ``target_id``. Once those edges are gone, each node stored under one of
        the given qnodes that is no longer referenced (per
        ``edge_to_nodes_map``) by a remaining edge is removed too. ``kg`` is
        modified in place; ``edge_to_nodes_map`` is only read.

        :param kg: the answer knowledge graph to prune.
        :param edge_to_nodes_map: for each edge ID, the node ID used for each
            qnode ID.
        :param qedge_id: key into ``kg.edges_by_qg_id`` selecting the edges to
            examine (an ID used as a dictionary key, not a QEdge object).
        :param qnodes: query nodes whose answer nodes may need pruning.
        :param log: response object used for debug logging.
        :return: the same ``kg`` object, after pruning.
        """
        log.debug("Removing any self-edges from the answer KG")
        edges = kg.edges_by_qg_id[qedge_id]

        # Collect the keys first so the dict is not resized while being iterated
        self_edge_keys = [edge_key for edge_key, edge in edges.items()
                          if edge.source_id == edge.target_id]
        for edge_key in self_edge_keys:
            edges.pop(edge_key)

        # Remove any nodes that may have been orphaned as a result of removing self-edges
        for qnode in qnodes:
            node_ids_in_use = {edge_to_nodes_map[edge.id][qnode.id]
                               for edge in edges.values()}
            nodes = kg.nodes_by_qg_id[qnode.id]
            for orphan_id in set(nodes).difference(node_ids_in_use):
                nodes.pop(orphan_id)

        log.debug(
            f"After removing self-edges, answer KG counts are: {eu.get_printable_counts_by_qg_id(kg)}"
        )
        return kg
Exemplo n.º 3
0
    def _load_answers_into_kg(self, neo4j_results: List[Dict[str, List[Dict[str, any]]]], kp: str,  query_graph: QueryGraph,
                              log: Response) -> Tuple[DictKnowledgeGraph, Dict[str, Dict[str, str]]]:
        """Turn the first row of neo4j results into an answer KG.

        Columns whose name starts with 'nodes' (e.g. 'nodes_n00') are loaded as
        nodes under the qnode ID that follows the 'nodes_' prefix; columns
        whose name starts with 'edges' (e.g. 'edges_e01') are loaded as edges
        under the qedge ID that follows 'edges_'. Other columns are ignored.

        :param neo4j_results: query results; only element 0 is read.
        :param kp: "KG2" selects the KG2 edge converter; any other value uses
            the KG1 converter. The node UUID-to-curie lookup is built only
            when kp is "KG1".
        :param query_graph: the query graph that produced these results.
        :param log: response object used for debug logging.
        :return: the populated knowledge graph and a map of
            edge ID -> {qnode ID -> value stored under that qnode ID on the
            neo4j edge record}.
        """
        log.debug(f"Processing query results for edge {query_graph.edges[0].id}")
        answer_kg = DictKnowledgeGraph()
        nodes_used_by_edge = dict()
        uuid_to_curie = self._build_node_uuid_to_curie_dict(neo4j_results[0]) if kp == "KG1" else dict()

        results_table = neo4j_results[0]
        for column_name in results_table:
            if column_name.startswith('nodes'):
                # Node column: everything after the 'nodes_' prefix is the qnode ID
                qnode_id_for_column = column_name.replace("nodes_", "", 1)
                for neo4j_node in results_table.get(column_name):
                    answer_kg.add_node(self._convert_neo4j_node_to_swagger_node(neo4j_node, kp),
                                       qnode_id_for_column)
            elif column_name.startswith('edges'):
                # Edge column: everything after the 'edges_' prefix is the qedge ID
                qedge_id_for_column = column_name.replace("edges_", "", 1)
                for neo4j_edge in results_table.get(column_name):
                    if kp == "KG2":
                        swagger_edge = self._convert_kg2_edge_to_swagger_edge(neo4j_edge)
                    else:
                        swagger_edge = self._convert_kg1_edge_to_swagger_edge(neo4j_edge, uuid_to_curie)

                    # Remember which value the edge record carries for each qnode ID
                    qnode_to_node = nodes_used_by_edge.setdefault(swagger_edge.id, dict())
                    for qnode in query_graph.nodes:
                        qnode_to_node[qnode.id] = neo4j_edge.get(qnode.id)

                    answer_kg.add_edge(swagger_edge, qedge_id_for_column)

        return answer_kg, nodes_used_by_edge
Exemplo n.º 4
0
    def apply(self, input_message: Message,
              input_parameters: dict) -> Response:
        """Apply the Resultifier to a message.

        A fresh Response is created and stored, together with the message and
        the parameters, on the instance. ``_resultify(describe=False)`` is
        then called, followed by ``_clean_up_kg()``.

        :param input_message: the message to operate on.
        :param input_parameters: parameters for this run; must be a dict.
        :return: the Response created for this call. It is returned early
            (before any work is done) when the parameters are not a dict or
            the response status is not 'OK'.
        """
        # Start from a fresh response and remember what we were given
        response = Response()
        self.message = input_message
        self.response = response

        # Reject anything that is not a parameter dictionary
        if not isinstance(input_parameters, dict):
            response.error("Provided parameters is not a dict",
                           error_code="ParametersNotDict")
            return response

        # Bail out if an error has been recorded on the response
        if response.status != 'OK':
            return response

        # Keep the parameters handy on both the response and the instance
        self.parameters = input_parameters
        response.data['parameters'] = input_parameters

        response.debug(
            f"Applying Resultifier to Message with parameters {input_parameters}"
        )

        # Build the results, then trim the KG down to what the results use
        self._resultify(describe=False)
        self._clean_up_kg()

        return response
Exemplo n.º 5
0
    def apply(self, input_message, input_parameters, response=None):
        """Validate the requested overlay action and run it on a message.

        The action named by ``input_parameters['action']`` must be one of
        ``self.allowable_actions``. It is dispatched to the private method
        ``_<ClassName>__<action>`` of this instance, called with no arguments.

        :param input_message: the message to operate on (stored as
            ``self.message``).
        :param input_parameters: dict of parameters; must contain 'action'.
        :param response: optional existing response to log into; a new
            Response is created when omitted.
        :return: the response. It is returned early, before any action runs,
            when the parameters are not a dict, the action is missing or not
            allowed, or the response status is not 'OK'.
        """
        #### Define a default response
        if response is None:
            response = Response()
        self.response = response
        self.message = input_message

        #### Basic checks on arguments
        if not isinstance(input_parameters, dict):
            response.error("Provided parameters is not a dict",
                           error_code="ParametersNotDict")
            return response

        # list of actions that have so far been created for ARAX_overlay
        allowable_actions = self.allowable_actions

        # check to see if an action is actually provided
        if 'action' not in input_parameters:
            response.error(
                f"Must supply an action. Allowable actions are: action={allowable_actions}",
                error_code="MissingAction")
        elif input_parameters['action'] not in allowable_actions:
            response.error(
                f"Supplied action {input_parameters['action']} is not permitted. Allowable actions are: {allowable_actions}",
                error_code="UnknownAction")

        #### Return if any of the parameters generated an error (showing not just the first one)
        if response.status != 'OK':
            return response

        # Work on a shallow copy so the caller's dict object is not the one we store
        parameters = dict(input_parameters)

        #### Store these final parameters for convenience
        response.data['parameters'] = parameters
        self.parameters = parameters

        # Announce the action BEFORE running it so the log reads in execution order
        response.debug(
            f"Applying Overlay to Message with parameters {parameters}"
        )  # TODO: re-write this to be more specific about the actual action

        # convert the action string to a function call (so I don't need a ton of if statements)
        # thank you https://stackoverflow.com/questions/11649848/call-methods-by-string
        action_method = getattr(
            self, '_' + self.__class__.__name__ + '__' + parameters['action'])
        action_method()

        # TODO: add_pubmed_ids
        # TODO: compute_confidence_scores
        # TODO: finish COHD
        # TODO: Jaccard

        #### Return the response and done
        if self.report_stats:  # helper to report information in debug if class self.report_stats = True
            response = self.report_response_stats(response)
        return response
Exemplo n.º 6
0
    def apply(self, input_message, input_parameters):
        """Apply result filters to a message.

        Supported parameters and their defaults are ``maximum_results``
        (None), ``minimum_confidence`` (None) and ``start_node`` (1). Every
        unsupported key in ``input_parameters`` is reported as an
        "UnknownParameter" error. At present only ``maximum_results`` triggers
        any filtering.

        :param input_message: the message to filter (stored as
            ``self.message``).
        :param input_parameters: dict of parameter overrides.
        :return: the Response created for this call; returned early when the
            parameters are not a dict or the response status is not 'OK'.
        """
        #### Start from a fresh response
        response = Response()
        self.response = response
        self.message = input_message

        #### The parameters must arrive as a dict
        if not isinstance(input_parameters, dict):
            response.error("Provided parameters is not a dict", error_code="ParametersNotDict")
            return response

        #### Every allowed parameter, with its default value
        parameters = {
            'maximum_results': None,
            'minimum_confidence': None,
            'start_node': 1
        }

        #### Override defaults with supplied values, flagging every unknown key
        for name, supplied_value in input_parameters.items():
            if name in parameters:
                parameters[name] = supplied_value
                continue
            response.error(f"Supplied parameter {name} is not permitted", error_code="UnknownParameter")

        #### Stop here if anything above was reported as an error
        if response.status != 'OK':
            return response

        #### Keep the final parameters on the response and on the instance
        response.data['parameters'] = parameters
        self.parameters = parameters

        #### Order of operations is probably quite important: scalar value filters
        #### (like minimum_confidence) first, then complex logic filters based on
        #### edge or node properties, and finally maximum_results
        response.debug(f"Applying filter to Message with parameters {parameters}")

        #### Disabled experiment: blow away the results and see if we can recompute them
        #message.n_results = 0
        #message.results = []
        #self.__recompute_results()

        #### TODO: scalar value filters, to do easy things and reduce the problem

        #### TODO: complex logic filters. These may be hard

        #### Last of all, cap the number of results if a maximum was requested
        maximum_results = parameters['maximum_results']
        if maximum_results is not None:
            self.__apply_maximum_results_filter(maximum_results)

        return response
Exemplo n.º 7
0
    def _expand_node(self, qnode_id: str, kp_to_use: str,
                     continue_if_no_results: bool, query_graph: QueryGraph,
                     use_synonyms: bool, synonym_handling: str,
                     log: Response) -> DictKnowledgeGraph:
        """Expand a single query node using the specified knowledge provider.

        :param qnode_id: ID of the query node to expand.
        :param kp_to_use: knowledge provider; only "ARAX/KG1" and "ARAX/KG2"
            are accepted for single-node queries.
        :param continue_if_no_results: when False, an answer that has no nodes
            for the qnode is logged as an "UnfulfilledQGID" error.
        :param query_graph: the query graph containing the node.
        :param use_synonyms: whether to add curie synonyms to the copied qnode
            before querying.
        :param synonym_handling: nodes are deduplicated unless this is
            'add_all'.
        :param log: response object used for logging and status checks.
        :return: the answer KG (empty when an error was logged before the
            query ran).
        """
        log.debug(f"Expanding node {qnode_id} using {kp_to_use}")
        query_node = eu.get_query_node(query_graph, qnode_id)
        answer_kg = DictKnowledgeGraph()
        if log.status != 'OK':
            return answer_kg
        if not query_node.curie:
            log.error(
                "Cannot expand a single query node if it doesn't have a curie",
                error_code="InvalidQuery")
            return answer_kg

        # Work on a copy so the caller's query node is left untouched
        qnode_copy = eu.copy_qnode(query_node)
        if use_synonyms:
            self._add_curie_synonyms_to_query_nodes(qnodes=[qnode_copy],
                                                    log=log,
                                                    kp=kp_to_use)
        if qnode_copy.type in ["protein", "gene"]:
            qnode_copy.type = ["protein", "gene"]
        log.debug(f"Modified query node is: {qnode_copy.to_dict()}")

        # Only some KPs can answer single-node queries
        valid_kps_for_single_node_queries = ["ARAX/KG1", "ARAX/KG2"]
        if kp_to_use not in valid_kps_for_single_node_queries:
            log.error(
                f"Invalid knowledge provider: {kp_to_use}. Valid options for single-node queries are "
                f"{', '.join(valid_kps_for_single_node_queries)}",
                error_code="InvalidKP")
            return answer_kg

        from Expand.kg_querier import KGQuerier
        answer_kg = KGQuerier(log, kp_to_use).answer_single_node_query(qnode_copy)
        log.info(
            f"Query for node {qnode_copy.id} returned results ({eu.get_printable_counts_by_qg_id(answer_kg)})"
        )

        # Make sure the qnode has been fulfilled (unless we're continuing if no results)
        if log.status == 'OK' and not continue_if_no_results:
            qnode_is_fulfilled = (
                qnode_copy.id in answer_kg.nodes_by_qg_id
                and answer_kg.nodes_by_qg_id[qnode_copy.id])
            if not qnode_is_fulfilled:
                log.error(
                    f"Returned answer KG does not contain any results for QNode {qnode_copy.id}",
                    error_code="UnfulfilledQGID")
                return answer_kg

        if synonym_handling != 'add_all':
            # There is no edge-to-node map for a single-node query, so the second result is unused
            answer_kg, _ = self._deduplicate_nodes(
                dict_kg=answer_kg, edge_to_nodes_map={}, log=log)
        return answer_kg
Exemplo n.º 8
0
 def _merge_answer_into_message_kg(answer_dict_kg: DictKnowledgeGraph,
                                   dict_kg: DictKnowledgeGraph,
                                   log: Response):
     """Merge an answer KG (from the current edge/node expansion) into the overarching KG.

     Every node and edge of ``answer_dict_kg`` is added to ``dict_kg`` under
     the same qnode/qedge ID; ``dict_kg`` is modified in place.

     :param answer_dict_kg: the KG produced by the current expansion step.
     :param dict_kg: the KG that accumulates all answers.
     :param log: response object used for debug logging.
     """
     log.debug("Merging answer into Message.KnowledgeGraph")
     for qnode_id, answer_nodes in answer_dict_kg.nodes_by_qg_id.items():
         for answer_node in answer_nodes.values():
             dict_kg.add_node(answer_node, qnode_id)
     for qedge_id, answer_edges in answer_dict_kg.edges_by_qg_id.items():
         for answer_edge in answer_edges.values():
             dict_kg.add_edge(answer_edge, qedge_id)
Exemplo n.º 9
0
    def _convert_one_hop_query_graph_to_cypher_query(self, query_graph: QueryGraph, enforce_directionality: bool,
                                                     kp: str, log: Response) -> str:
        """Build the cypher query that answers a one-hop (single-edge) query graph.

        The query has the form ``MATCH ... WHERE ... WITH ... RETURN ...`` and
        returns three columns: ``nodes_<source qnode id>``,
        ``nodes_<target qnode id>`` and ``edges_<qedge id>``. Each edge record
        also carries the edge's neo4j ID and, under each qnode ID, the ``id``
        of the node bound to that qnode.

        :param query_graph: query graph; only its first edge is used.
        :param enforce_directionality: passed through to the qedge cypher
            builder.
        :param kp: knowledge provider name; when it contains "KG2" node types
            are matched on ``category_label``, otherwise on ``category``.
        :param log: response object used for logging.
        :return: the cypher query, or an empty string if anything went wrong
            (the problem is logged as an error, with the exception's class
            name as the error code).
        """
        try:
            # Kept inside the try so that an empty edge list is reported through the log too
            log.debug(f"Generating cypher for edge {query_graph.edges[0].id} query graph")

            # Build the match clause
            qedge = query_graph.edges[0]
            source_qnode = eu.get_query_node(query_graph, qedge.source_id)
            target_qnode = eu.get_query_node(query_graph, qedge.target_id)
            qedge_cypher = self._get_cypher_for_query_edge(qedge, enforce_directionality)
            source_qnode_cypher = self._get_cypher_for_query_node(source_qnode)
            target_qnode_cypher = self._get_cypher_for_query_node(target_qnode)
            match_clause = f"MATCH {source_qnode_cypher}{qedge_cypher}{target_qnode_cypher}"

            # Build the where clause
            where_fragments = []
            for qnode in (source_qnode, target_qnode):
                if qnode.curie:
                    if isinstance(qnode.curie, str):
                        # Escape backslashes and single quotes so that the curie cannot
                        # terminate the quoted cypher string literal early
                        escaped_curie = qnode.curie.replace("\\", "\\\\").replace("'", "\\'")
                        where_fragments.append(f"{qnode.id}.id='{escaped_curie}'")
                    else:
                        # A list of curies is written out as a cypher list literal
                        where_fragments.append(f"{qnode.id}.id in {qnode.curie}")
                if qnode.type and isinstance(qnode.type, list):
                    node_type_property = "category_label" if "KG2" in kp else "category"
                    where_fragments.append(f"{qnode.id}.{node_type_property} in {qnode.type}")
            where_clause = f"WHERE {' AND '.join(where_fragments)}" if where_fragments else ""

            # Build the with clause
            source_qnode_col_name = f"nodes_{source_qnode.id}"
            target_qnode_col_name = f"nodes_{target_qnode.id}"
            qedge_col_name = f"edges_{qedge.id}"
            # This line grabs the edge's ID and a record of which of its nodes correspond to which qnode ID
            extra_edge_properties = "{.*, " + f"id:ID({qedge.id}), {source_qnode.id}:{source_qnode.id}.id, {target_qnode.id}:{target_qnode.id}.id" + "}"
            with_clause = f"WITH collect(distinct {source_qnode.id}) as {source_qnode_col_name}, " \
                          f"collect(distinct {target_qnode.id}) as {target_qnode_col_name}, " \
                          f"collect(distinct {qedge.id}{extra_edge_properties}) as {qedge_col_name}"

            # Build the return clause
            return_clause = f"RETURN {source_qnode_col_name}, {target_qnode_col_name}, {qedge_col_name}"

            return f"{match_clause} {where_clause} {with_clause} {return_clause}"
        except Exception as error:
            tb = traceback.format_exc()
            log.error(f"Problem generating cypher for query. {tb}", error_code=type(error).__name__)
            return ""
Exemplo n.º 10
0
def get_curie_synonyms(curie: Union[str, List[str]],
                       log: Response) -> List[str]:
    """Return the given curies together with all of their equivalent curies.

    The input is normalized to a list and passed to
    ``NodeSynonymizer.get_equivalent_nodes()`` with ``kg_name="KG2"``. An
    exception raised while using the synonymizer is not propagated: it is
    logged as an error whose error code is the exception's class name.

    :param curie: a single curie or a list of curies.
    :param log: response object that receives debug/warning/error messages.
    :return: a sorted list holding the input curies plus every equivalent
        curie found, or an empty list when the lookup raised an exception or
        returned None.
    """
    curie_list = convert_string_or_list_to_list(curie)
    try:
        synonymizer = NodeSynonymizer()
        log.debug(
            f"Sending NodeSynonymizer.get_equivalent_nodes() a list of {len(curie_list)} curies"
        )
        equivalents_by_curie = synonymizer.get_equivalent_nodes(
            curie_list, kg_name="KG2")
        log.debug("Got response back from NodeSynonymizer")
    except Exception as problem:
        log.error(
            f"Encountered a problem using NodeSynonymizer: {traceback.format_exc()}",
            error_code=type(problem).__name__)
        return []

    if equivalents_by_curie is None:
        log.error("NodeSynonymizer returned None",
                  error_code="NodeNormalizationIssue")
        return []

    without_equivalents = {
        key for key, equivalents in equivalents_by_curie.items()
        if not equivalents
    }
    if without_equivalents:
        log.warning(
            f"NodeSynonymizer did not find any equivalent curies for: {without_equivalents}"
        )

    # Seed with the inputs so that even curies without synonyms are included
    combined = set(curie_list)
    for equivalents in equivalents_by_curie.values():
        if equivalents:
            combined.update(equivalents)
    return sorted(combined)
Exemplo n.º 11
0
 def _answer_query_using_bte(
         self, input_qnode: QNode, output_qnode: QNode, qedge: QEdge,
         answer_kg: DictKnowledgeGraph,
         valid_bte_inputs_dict: Dict[str, Set[str]],
         log: Response) -> Tuple[DictKnowledgeGraph, Set[str]]:
     """Send a single-edge query to BTE, one input curie at a time.

     For every input curie whose prefix is listed in
     ``valid_bte_inputs_dict['curie_prefixes']``, one query is sent per
     combination of input and output qnode type, and the findings are added
     to ``answer_kg`` as we go. On the first exception raised while querying,
     the error is logged (with the exception's class name as the error code)
     and whatever has been collected so far is returned.

     :param input_qnode: query node supplying the input curies and types.
     :param output_qnode: query node supplying the output types.
     :param qedge: the query edge; its ``type`` is sent as the predicate.
     :param answer_kg: the knowledge graph that answers are added to.
     :param valid_bte_inputs_dict: must contain the key 'curie_prefixes'.
     :param log: response object used for logging.
     :return: the answer KG and the set of curies that queries were sent for.
     """
     accepted_curies = set()
     for curie in input_qnode.curie:
         # The prefix does not depend on the type combination, so check it once per curie
         curie_prefix = eu.get_curie_prefix(curie)
         if curie_prefix not in valid_bte_inputs_dict['curie_prefixes']:
             continue
         # Consider all different combinations of qnode types (can be multiple if gene/protein)
         for input_qnode_type, output_qnode_type in itertools.product(
                 input_qnode.type, output_qnode.type):
             accepted_curies.add(curie)
             loop = None
             try:
                 loop = asyncio.new_event_loop()
                 seqd = SingleEdgeQueryDispatcher(
                     input_cls=input_qnode_type,
                     output_cls=output_qnode_type,
                     pred=qedge.type,
                     input_id=curie_prefix,
                     values=eu.get_curie_local_id(curie),
                     loop=loop)
                 log.debug(
                     f"Sending query to BTE: {curie}-{qedge.type if qedge.type else ''}->{output_qnode_type}"
                 )
                 seqd.query()
                 reasoner_std_response = seqd.to_reasoner_std()
             except Exception as error:
                 trace_back = traceback.format_exc()
                 log.error(
                     f"Encountered a problem while using BioThings Explorer. {trace_back}",
                     error_code=type(error).__name__)
                 return answer_kg, accepted_curies
             else:
                 answer_kg = self._add_answers_to_kg(
                     answer_kg, reasoner_std_response, input_qnode.id,
                     output_qnode.id, qedge.id, log)
             finally:
                 # Each query gets its own event loop; close it so its resources are released
                 if loop is not None:
                     loop.close()
     return answer_kg, accepted_curies
Exemplo n.º 12
0
 def _add_curie_synonyms_to_query_nodes(qnodes: List[QNode], log: Response,
                                        kp: str):
     """Replace each query node's curie with that curie plus its synonyms.

     Query nodes without a curie are left untouched. For the others, the
     curie is replaced by the list returned by ``eu.get_curie_synonyms()``
     and, unless ``kp`` contains "BTE", the node's type is cleared. The
     qnodes are modified in place.

     :param qnodes: the query nodes to process.
     :param log: response object used for debug logging.
     :param kp: name of the knowledge provider the query is destined for.
     """
     log.debug("Looking for query nodes to use curie synonyms for")
     for qnode in qnodes:
         if not qnode.curie:
             continue
         log.debug(
             f"Getting curie synonyms for qnode {qnode.id} using the NodeSynonymizer"
         )
         equivalent_curies = eu.get_curie_synonyms(qnode.curie, log)
         log.debug(
             f"Using {len(equivalent_curies)} equivalent curies for qnode {qnode.id}"
         )
         qnode.curie = equivalent_curies
         if "BTE" in kp:
             continue
         # Important to clear when using synonyms; otherwise we're limited #889
         qnode.type = None
Exemplo n.º 13
0
    def check_for_query_graph_tags(self, message, query_graph_info):
        """Tally which KnowledgeGraph nodes and edges carry query graph IDs.

        Results are stored on the instance: ``n_nodes`` / ``n_edges`` (totals),
        ``node_map['by_qnode_id']`` and ``edge_map['by_qedge_id']`` (for each
        query graph ID, a dict whose keys are the IDs of the nodes/edges
        tagged with it), and ``query_graph_id_node_status`` /
        ``query_graph_id_edge_status`` (whether all, none or only some items
        are tagged).

        :param message: message whose ``knowledge_graph`` is inspected.
        :param query_graph_info: not used by this method.
        :return: the Response created for this call.
        """
        #### Start from a fresh response
        response = Response()
        self.response = response
        self.message = message
        response.debug("Checking KnowledgeGraph for QueryGraph tags")

        #### Shorter handles
        knowledge_graph = message.knowledge_graph
        nodes = knowledge_graph.nodes
        edges = knowledge_graph.edges

        #### Record the totals
        self.n_nodes = len(nodes)
        self.n_edges = len(edges)
        response.debug(f"Found {self.n_nodes} nodes and {self.n_edges} edges")

        #### Reset the maps
        self.node_map = {'by_qnode_id': {}}
        self.edge_map = {'by_qedge_id': {}}

        #### Index the tagged nodes by qnode_id, counting them along the way
        nodes_by_qnode_id = self.node_map['by_qnode_id']
        tagged_node_count = 0
        for node in nodes:
            if node.qnode_id is None:
                continue
            tagged_node_count += 1
            nodes_by_qnode_id.setdefault(node.qnode_id, {})[node.id] = 1

        #### Summarize the node tagging
        if tagged_node_count == self.n_nodes:
            self.query_graph_id_node_status = 'all nodes have query_graph_ids'
        elif tagged_node_count == 0:
            self.query_graph_id_node_status = 'no nodes have query_graph_ids'
        else:
            self.query_graph_id_node_status = 'only some nodes have query_graph_ids'
        response.info(
            f"In the KnowledgeGraph, {self.query_graph_id_node_status}")

        #### Index the tagged edges by qedge_id, counting them along the way
        edges_by_qedge_id = self.edge_map['by_qedge_id']
        tagged_edge_count = 0
        for edge in edges:
            if edge.qedge_id is None:
                continue
            tagged_edge_count += 1
            edges_by_qedge_id.setdefault(edge.qedge_id, {})[edge.id] = 1

        #### Summarize the edge tagging
        if tagged_edge_count == self.n_edges:
            self.query_graph_id_edge_status = 'all edges have query_graph_ids'
        elif tagged_edge_count == 0:
            self.query_graph_id_edge_status = 'no edges have query_graph_ids'
        else:
            self.query_graph_id_edge_status = 'only some edges have query_graph_ids'
        response.info(
            f"In the KnowledgeGraph, {self.query_graph_id_edge_status}")

        return response
Exemplo n.º 14
0
    def _deduplicate_nodes(
            dict_kg: DictKnowledgeGraph,
            edge_to_nodes_map: Dict[str, Dict[str, str]], log: Response
    ) -> Tuple[DictKnowledgeGraph, Dict[str, Dict[str, str]]]:
        """Collapse nodes onto their preferred curies and re-point edges accordingly.

        Preferred curie/name/type information comes from
        ``eu.get_preferred_curies()``; nodes it has no info for keep their own
        ID, name and type. Under each qnode ID, the first node seen for a given
        preferred curie is kept (its ``id``, ``name`` and ``type`` are
        overwritten in place) and later nodes mapping to the same curie are
        dropped. Edge ``source_id``/``target_id`` values and the edge-to-nodes
        map are then rewritten to the preferred curies.

        :param dict_kg: the knowledge graph to deduplicate; its node and edge
            objects are mutated.
        :param edge_to_nodes_map: for each edge ID, the node ID used for each
            qnode ID. Edges missing from this map are simply not recorded in
            the updated map.
        :param log: response object used for logging and status checks.
        :return: the deduplicated KG and the updated edge-to-nodes map. Both
            may be incomplete if an error was logged part-way through.
        """
        log.debug("Deduplicating nodes")
        deduplicated_kg = DictKnowledgeGraph(
            nodes={qnode_id: dict()
                   for qnode_id in dict_kg.nodes_by_qg_id},
            edges={qedge_id: dict()
                   for qedge_id in dict_kg.edges_by_qg_id})
        updated_edge_to_nodes_map = {
            edge_id: dict()
            for edge_id in edge_to_nodes_map
        }
        curie_mappings = dict()  # original node ID -> preferred curie
        preferred_details = dict()  # original node ID -> (preferred name, preferred type)

        # First deduplicate the nodes
        for qnode_id, nodes in dict_kg.nodes_by_qg_id.items():
            # Load preferred curie info from NodeSynonymizer for nodes we haven't seen before
            unmapped_node_ids = set(nodes).difference(curie_mappings)
            log.debug(
                f"Getting preferred curies for {qnode_id} nodes returned in this step"
            )
            canonicalized_nodes = eu.get_preferred_curies(
                list(unmapped_node_ids), log) if unmapped_node_ids else dict()
            if log.status != 'OK':
                return deduplicated_kg, updated_edge_to_nodes_map

            # Visit EVERY node of this qnode, not just the unmapped ones: a node ID that was
            # already mapped under an earlier qnode still has to be added under this one
            for node_id, node in nodes.items():
                if node_id not in curie_mappings:
                    # Figure out the preferred curie/name/type for this node
                    canonicalized_node = canonicalized_nodes.get(node_id)
                    if canonicalized_node:
                        curie_mappings[node_id] = canonicalized_node.get(
                            'preferred_curie', node_id)
                        preferred_details[node_id] = (
                            canonicalized_node.get('preferred_name', node.name),
                            eu.convert_string_or_list_to_list(
                                canonicalized_node.get('preferred_type', node.type)))
                    else:
                        # Means the NodeSynonymizer didn't recognize this curie
                        curie_mappings[node_id] = node_id
                        preferred_details[node_id] = (node.name, node.type)

                # Add this node into our deduplicated KG as necessary # TODO: merge certain fields, like uri?
                preferred_curie = curie_mappings[node_id]
                if preferred_curie not in deduplicated_kg.nodes_by_qg_id[qnode_id]:
                    preferred_name, preferred_type = preferred_details[node_id]
                    node.id = preferred_curie
                    node.name = preferred_name
                    node.type = preferred_type
                    deduplicated_kg.add_node(node, qnode_id)

        # Then update the edges to reflect changes made to the nodes
        for qedge_id, edges in dict_kg.edges_by_qg_id.items():
            for edge_id, edge in edges.items():
                edge.source_id = curie_mappings.get(edge.source_id)
                edge.target_id = curie_mappings.get(edge.target_id)
                if not edge.source_id or not edge.target_id:
                    log.error(
                        f"Could not find preferred curie mappings for edge {edge_id}'s node(s)"
                    )
                    return deduplicated_kg, updated_edge_to_nodes_map
                deduplicated_kg.add_edge(edge, qedge_id)

                # Update the edge-to-node map for this edge (used down the line for pruning).
                # An edge absent from the incoming map is skipped rather than raising KeyError.
                for qnode_id, corresponding_node_id in edge_to_nodes_map.get(
                        edge_id, dict()).items():
                    updated_edge_to_nodes_map.setdefault(edge_id, dict())[
                        qnode_id] = curie_mappings.get(corresponding_node_id)

        log.debug(
            f"After deduplication, answer KG counts are: {eu.get_printable_counts_by_qg_id(deduplicated_kg)}"
        )
        return deduplicated_kg, updated_edge_to_nodes_map
Exemplo n.º 15
0
    def _expand_edge(
            self, qedge: QEdge, kp_to_use: str, dict_kg: DictKnowledgeGraph,
            continue_if_no_results: bool, query_graph: QueryGraph,
            use_synonyms: bool, synonym_handling: str, log: Response
    ) -> Tuple[DictKnowledgeGraph, Dict[str, Dict[str, str]]]:
        """Answer a single-edge (one-hop) query using the specified knowledge provider.

        :param qedge: the query edge to expand.
        :param kp_to_use: one of "ARAX/KG1", "ARAX/KG2", "BTE", "COHD", "NGD".
        :param dict_kg: the KG built so far; handed to the per-edge query
            graph builder.
        :param continue_if_no_results: when True an unfulfilled query is only
            a warning; otherwise it is logged as a "NoResults" error.
        :param query_graph: the full query graph.
        :param use_synonyms: passed through to the per-edge query graph
            builder.
        :param synonym_handling: nodes are deduplicated unless this is
            'add_all'.
        :param log: response object used for logging and status checks.
        :return: the answer KG and its edge-to-nodes map (both empty when an
            error was logged before the KP was queried).
        """
        log.info(f"Expanding edge {qedge.id} using {kp_to_use}")
        answer_kg = DictKnowledgeGraph()
        edge_to_nodes_map = dict()

        # Create a query graph for this edge (that uses synonyms as well as curies found in prior steps)
        edge_query_graph = self._get_query_graph_for_edge(
            qedge, query_graph, dict_kg, use_synonyms, kp_to_use, log)
        if log.status != 'OK':
            return answer_kg, edge_to_nodes_map
        if not any(qnode for qnode in edge_query_graph.nodes if qnode.curie):
            log.error(
                f"Cannot expand an edge for which neither end has any curies. (Could not find curies to use from "
                f"a prior expand step, and neither qnode has a curie specified.)",
                error_code="InvalidQuery")
            return answer_kg, edge_to_nodes_map

        valid_kps = ["ARAX/KG1", "ARAX/KG2", "BTE", "COHD", "NGD"]
        if kp_to_use not in valid_kps:
            log.error(
                f"Invalid knowledge provider: {kp_to_use}. Valid options are {', '.join(valid_kps)}",
                error_code="InvalidKP")
            return answer_kg, edge_to_nodes_map

        # Querier modules are imported lazily, so only the one actually needed gets loaded
        if kp_to_use == 'BTE':
            from Expand.bte_querier import BTEQuerier
            kp_querier = BTEQuerier(log)
        elif kp_to_use == 'COHD':
            from Expand.COHD_querier import COHDQuerier
            kp_querier = COHDQuerier(log)
        elif kp_to_use == 'NGD':
            from Expand.ngd_querier import NGDQuerier
            kp_querier = NGDQuerier(log)
        else:
            from Expand.kg_querier import KGQuerier
            kp_querier = KGQuerier(log, kp_to_use)

        answer_kg, edge_to_nodes_map = kp_querier.answer_one_hop_query(
            edge_query_graph)
        if log.status != 'OK':
            return answer_kg, edge_to_nodes_map
        log.debug(
            f"Query for edge {qedge.id} returned results ({eu.get_printable_counts_by_qg_id(answer_kg)})"
        )

        # Do some post-processing (deduplicate nodes, remove self-edges..)
        if synonym_handling != 'add_all':
            answer_kg, edge_to_nodes_map = self._deduplicate_nodes(
                answer_kg, edge_to_nodes_map, log)
        if eu.qg_is_fulfilled(edge_query_graph, answer_kg):
            answer_kg = self._remove_self_edges(answer_kg,
                                                edge_to_nodes_map,
                                                qedge.id,
                                                edge_query_graph.nodes,
                                                log)

        # Re-check fulfilment, since removing self-edges may have changed the KG
        if not eu.qg_is_fulfilled(edge_query_graph, answer_kg):
            no_paths_message = f"No paths were found in {kp_to_use} satisfying this query graph"
            if continue_if_no_results:
                log.warning(no_paths_message)
            else:
                log.error(no_paths_message, error_code="NoResults")

        return answer_kg, edge_to_nodes_map
Exemplo n.º 16
0
    def parse(self, input_actions):
        """
        Parses a list of action strings into a list of structured action dicts
        :param input_actions: List of action strings. Each entry is either a bare command (`command`) or a
            command with a parenthesized, comma-separated parameter list (`command(key=value, other=[a,b])`).
            Blank entries and entries starting with `#` (optionally indented) are skipped.
        :type input_actions: list
        :return: Response object with execution information. When the input passes the basic checks,
            response.data["actions"] holds one dict per parsed action with keys "line" (1-based position
            of the entry in input_actions), "command" and "parameters" (a dict, or None for a bare command)
        :rtype: Response
        """

        #### Define a default response
        response = Response()
        response.info(f"Parsing input actions list")

        #### Basic error checking of the input_actions
        if not isinstance(input_actions, list):
            response.error("Provided input actions is not a list",
                           error_code="ActionsNotList")
            return response
        if len(input_actions) == 0:
            response.error("Provided input actions is an empty list",
                           error_code="ActionsListEmpty")
            return response

        #### Iterate through the list, checking the items
        actions = []
        # enumerate() keeps "line" aligned with the entry's real position, even when
        # blank or comment entries are skipped
        for line_number, action in enumerate(input_actions, start=1):
            response.debug(f"Parsing action: {action}")

            # The regular expressions below only work on strings; report anything else
            # instead of crashing with a TypeError
            if not isinstance(action, str):
                response.error(
                    f"Action at line {line_number} is not a string: {action}",
                    error_code="ActionNotString")
                continue

            # Skip empty lines and comment lines (a # optionally preceded by whitespace)
            if re.match(r"\s*(#|$)", action):
                continue

            #### First look for a naked command without parentheses
            naked_match = re.match(r"\s*([A-Za-z_]+)\s*$", action)
            if naked_match is not None:
                actions.append({
                    "line": line_number,
                    "command": naked_match.group(1),
                    "parameters": None
                })
                continue

            #### Then look for a command with parentheses and a comma-separated parameter list
            call_match = re.match(r"\s*([A-Za-z_]+)\((.*)\)\s*$", action)
            if call_match is None:
                response.error(f"Unable to parse action {action}",
                               error_code="UnparsableAction")
                continue
            command = call_match.group(1)
            param_string = call_match.group(2)

            #### Split the parameters on comma and process those.
            #### A value of the form key=[value1,value2] gets split apart by the comma split,
            #### so its pieces are collected in list_buffer until the closing "]" is seen.
            # NOTE(review): an empty parameter list such as `command()` yields {'': 'true'}
            # here - confirm that callers expect that
            parameters = {}
            key = ''
            list_buffer = []
            in_list = False
            for param_item in param_string.split(","):
                param_item = param_item.strip()

                #### Special processing if we're in the middle of a list
                if in_list:
                    closing_match = re.match(r"(.*)\]$", param_item)
                    if closing_match:
                        list_buffer.append(closing_match.group(1))
                        parameters[key] = list_buffer
                        in_list = False
                    else:
                        list_buffer.append(param_item)
                    continue

                #### Split on the first = only (might be = in the value)
                key, separator, value = param_item.partition("=")
                #### If there isn't a value after an =, then just set to string true
                if not separator:
                    value = 'true'
                key = key.strip()
                value = value.strip()

                #### If the value begins with a "[", then this is a list
                opening_match = re.match(r"\[(.+)$", value)
                if opening_match is None:
                    parameters[key] = value
                    continue

                #### If it also ends with a "]", then this is a list of zero or one element
                whole_list_match = re.match(r"\[(.*)\]$", value)
                if whole_list_match is None:
                    in_list = True
                    list_buffer = [opening_match.group(1)]
                elif whole_list_match.group(1) == '':
                    parameters[key] = []
                else:
                    parameters[key] = [whole_list_match.group(1)]

            # A list that was never closed with "]" is still stored with what was collected
            if in_list:
                parameters[key] = list_buffer

            #### Store the parsed result in a dict and add to the list
            actions.append({
                "line": line_number,
                "command": command,
                "parameters": parameters
            })

        #### Put the actions in the response data envelope and return
        response.data["actions"] = actions
        return response
Exemplo n.º 17
0
    def _prune_dead_end_paths(dict_kg: DictKnowledgeGraph,
                              full_query_graph: QueryGraph,
                              node_usages_by_edges_map: Dict[str,
                                                             Dict[str,
                                                                  Dict[str,
                                                                       str]]],
                              log: Response):
        """
        Removes 'dead-end' paths from the KG in place. (Because edges are expanded one-by-one, not all edges
        found in the last expansion will connect to edges in the next one.)
        :param dict_kg: KG whose nodes_by_qg_id and edges_by_qg_id dicts are pruned in place
        :param full_query_graph: Query graph whose nodes/edges define which qnodes are linked
        :param node_usages_by_edges_map: For each qedge id, for each edge key, the node id used for each qnode id.
            Example: {'e00': {'KG1:111221': {'n00': 'CUI:122', 'n01': 'CUI:124'}}}
        :param log: Response object used for debug logging
        :return: None (dict_kg is modified in place)
        """
        log.debug(f"Pruning any paths that are now dead ends")

        # Create a map of which qnodes are connected to which other qnodes
        # Example qnode_connections_map: {'n00': {'n01'}, 'n01': {'n00', 'n02'}, 'n02': {'n01'}}
        qnode_connections_map = {
            qnode.id: set()
            for qnode in full_query_graph.nodes
        }
        for qedge in full_query_graph.edges:
            # Only qnodes listed in the query graph get an entry, so guard both endpoints
            if qedge.source_id in qnode_connections_map:
                qnode_connections_map[qedge.source_id].add(qedge.target_id)
            if qedge.target_id in qnode_connections_map:
                qnode_connections_map[qedge.target_id].add(qedge.source_id)

        # Create a map of which nodes each node is connected to, organized first by the qnode_id the node
        # fulfills and then by the qnode_id of its neighbors
        # Example node_connections_map:
        #   {'n01': {'CUI:1222': {'n00': {'DOID:122'}, 'n02': {'UniProtKB:22', 'UniProtKB:333'}}}}
        node_connections_map = dict()
        for qedge_id, edges_to_nodes_dict in node_usages_by_edges_map.items():
            current_qedge = next(qedge for qedge in full_query_graph.edges
                                 if qedge.id == qedge_id)
            qnode_ids = [current_qedge.source_id, current_qedge.target_id]
            for node_usages_dict in edges_to_nodes_dict.values():
                for current_qnode_id in qnode_ids:
                    connected_qnode_id = next(qnode_id
                                              for qnode_id in qnode_ids
                                              if qnode_id != current_qnode_id)
                    current_node_id = node_usages_dict[current_qnode_id]
                    connected_node_id = node_usages_dict[connected_qnode_id]
                    connections_of_node = node_connections_map.setdefault(
                        current_qnode_id, dict()).setdefault(current_node_id, dict())
                    connections_of_node.setdefault(connected_qnode_id,
                                                   set()).add(connected_node_id)

        # Iteratively remove all disconnected nodes until there are none left (removing one node can
        # strand its neighbors, hence the repeated passes)
        qnode_ids_already_expanded = list(node_connections_map.keys())
        found_dead_end = True
        while found_dead_end:
            found_dead_end = False
            for qnode_id in qnode_ids_already_expanded:
                qnode_ids_should_be_connected_to = qnode_connections_map[
                    qnode_id].intersection(qnode_ids_already_expanded)
                for node_id, node_mappings_dict in node_connections_map[
                        qnode_id].items():
                    # Check if any mappings are even entered for all qnode_ids this node should be connected to
                    if set(node_mappings_dict) != qnode_ids_should_be_connected_to:
                        if node_id in dict_kg.nodes_by_qg_id[qnode_id]:
                            dict_kg.nodes_by_qg_id[qnode_id].pop(node_id)
                            found_dead_end = True
                        continue

                    # Verify that at least one of the entered connections still exists (for each connected qnode_id)
                    for connected_qnode_id, connected_node_ids in node_mappings_dict.items():
                        surviving_nodes = dict_kg.nodes_by_qg_id[connected_qnode_id]
                        # Probe the (small) connection set against the KG dict rather than copying
                        # every KG node key into a fresh set for each check
                        still_connected = any(connected_node_id in surviving_nodes
                                              for connected_node_id in connected_node_ids)
                        if not still_connected and node_id in dict_kg.nodes_by_qg_id[qnode_id]:
                            dict_kg.nodes_by_qg_id[qnode_id].pop(node_id)
                            found_dead_end = True

        # Then remove all orphaned edges (edges that use a node which is no longer in the KG)
        for qedge_id, edges_dict in node_usages_by_edges_map.items():
            for edge_key, node_mappings in edges_dict.items():
                for qnode_id, used_node_id in node_mappings.items():
                    if used_node_id not in dict_kg.nodes_by_qg_id[qnode_id]:
                        dict_kg.edges_by_qg_id[qedge_id].pop(edge_key, None)
Exemplo n.º 18
0
    def reassign_curies(self, message, input_parameters, describe=False):
        """
        Reassigns the CURIEs of the QueryGraph nodes to the target Knowledge Provider
        :param message: Translator standard Message object
        :type message: Message
        :param input_parameters: Dict of input parameters to control the method
            (allowed keys: knowledge_provider, mismap_result)
        :type input_parameters: dict
        :param describe: If True, only return the dict describing the allowable parameters
        :type describe: bool
        :return: Response object with execution information (or the description dict if describe=True)
        :rtype: Response
        """

        # #### Internal documentation setup
        allowable_parameters = {
            'knowledge_provider': {
                'Name of the Knowledge Provider CURIE space to map to. Default=KG1. Also currently supported KG2'
            },
            'mismap_result': {
                'Desired action when mapping fails: ERROR or WARNING. Default is ERROR'
            },
        }
        if describe:
            allowable_parameters[
                'dsl_command'] = '`reassign_curies()`'  # can't get this name at run-time, need to manually put it in per https://www.python.org/dev/peps/pep-3130/
            allowable_parameters[
                'brief_description'] = """The `reassign_curies` method reassigns all the CURIEs in the Message QueryGraph to the specified
                knowledge provider. Allowed values are KG1 or KG2. Default is KG1 if not specified."""
            return allowable_parameters

        #### Define a default response
        response = Response()
        self.response = response
        self.message = message

        #### Basic checks on arguments
        if not isinstance(input_parameters, dict):
            response.error("Provided parameters is not a dict",
                           error_code="ParametersNotDict")
            return response

        #### Define a complete set of allowed parameters and their defaults
        parameters = {
            'knowledge_provider': 'KG1',
            'mismap_result': 'ERROR',
        }

        #### Loop through the input_parameters and override the defaults and make sure they are allowed
        for key, value in input_parameters.items():
            if key not in parameters:
                response.error(f"Supplied parameter {key} is not permitted",
                               error_code="UnknownParameter")
            else:
                parameters[key] = value
        #### Return if any of the parameters generated an error (showing not just the first one)
        if response.status != 'OK':
            return response

        #### Store these final parameters for convenience
        response.data['parameters'] = parameters
        self.parameters = parameters
        knowledge_provider = parameters['knowledge_provider']

        # Check that the knowledge_provider is valid:
        if knowledge_provider not in ('KG1', 'KG2'):
            response.error(
                f"Specified knowledge provider must be 'KG1' or 'KG2', not '{knowledge_provider}'",
                error_code="UnknownKP")
            return response

        #### Now try to assign the CURIEs
        response.info(
            f"Reassigning the CURIEs in QueryGraph to {knowledge_provider} space"
        )

        #### Make sure there's a query_graph already here
        if message.query_graph is None:
            message.query_graph = QueryGraph()
            message.query_graph.nodes = []
            message.query_graph.edges = []
        if message.query_graph.nodes is None:
            message.query_graph.nodes = []

        #### Set up the KGNodeIndex
        kgNodeIndex = KGNodeIndex()

        # Loops through the QueryGraph nodes and adjust them
        for qnode in message.query_graph.nodes:

            # If the CURIE is None, then there's nothing to do
            curie = qnode.curie
            if curie is None:
                continue

            # Map the CURIE to the desired Knowledge Provider
            if knowledge_provider == 'KG1':
                if kgNodeIndex.is_curie_present(curie) is True:
                    mapped_entries = [curie]
                else:
                    # presumably a list of plain CURIE strings - TODO confirm against KGNodeIndex
                    mapped_entries = kgNodeIndex.get_KG1_curies(curie)
            else:
                # knowledge_provider was validated above, so this can only be KG2
                if kgNodeIndex.is_curie_present(curie, kg_name='KG2'):
                    mapped_entries = [curie]
                else:
                    mapped_entries = kgNodeIndex.get_curies_and_types(
                        curie, kg_name='KG2')

            # get_curies_and_types() results are read elsewhere in this file as dicts with a 'curie' key
            # (nodes[0]['curie']), so reduce every entry to its plain CURIE before comparing or assigning
            mapped_curies = [
                entry['curie'] if isinstance(entry, dict) else entry
                for entry in mapped_entries
            ]

            # Try to find a new CURIE
            new_curie = None
            if len(mapped_curies) == 0:
                if parameters['mismap_result'] == 'WARNING':
                    response.warning(
                        f"Did not find a mapping for {curie} to KP '{knowledge_provider}'. Leaving as is"
                    )
                else:
                    response.error(
                        f"Did not find a mapping for {curie} to KP '{knowledge_provider}'. This is an error"
                    )
            elif len(mapped_curies) == 1:
                new_curie = mapped_curies[0]
            elif curie in mapped_curies:
                # The original CURIE is among the candidates, so keep it
                new_curie = curie
            else:
                new_curie = mapped_curies[0]
                response.warning(
                    f"There are multiple possible CURIEs in KP '{knowledge_provider}'. Selecting the first one {new_curie}"
                )

            # If there's no CURIE, then nothing to do
            if new_curie is None:
                continue
            # If it's the same
            if new_curie == curie:
                response.debug(
                    f"CURIE {curie} is fine for KP '{knowledge_provider}'"
                )
            else:
                response.info(
                    f"Remapping CURIE {curie} to {new_curie} for KP '{knowledge_provider}'"
                )
                # Actually apply the remapping that was just announced
                qnode.curie = new_curie

        #### Return the response
        return response
Exemplo n.º 19
0
    def add_query_graph_tags(self, message, query_graph_info):
        """
        Adds QueryGraph ids to KnowledgeGraph nodes and edges that do not have one yet, matching them by type
        :param message: Message object whose knowledge_graph nodes and edges are tagged in place
        :param query_graph_info: Object providing node_type_map and edge_type_map, which are used here as
            lookups from a type to the QueryGraph id to assign
        :return: Response object with execution information. An error is recorded and the method returns
            early on the first node or edge that has no type or whose type cannot be matched unambiguously
        :rtype: Response
        """

        #### Define a default response
        response = Response()
        self.response = response
        self.message = message
        response.debug(f"Adding temporary QueryGraph ids to KnowledgeGraph")

        #### Get shorter handles
        knowledge_graph = message.knowledge_graph
        node_type_map = query_graph_info.node_type_map
        edge_type_map = query_graph_info.edge_type_map

        #### Loop through nodes adding qnode_ids
        for node in knowledge_graph.nodes:

            #### Nodes that already carry a qnode_id are left alone
            if node.qnode_id is not None:
                continue
            node_id = node.id
            types = node.type  # iterated below, so assumed to be a collection of type names

            #### Find a matching type in the QueryGraph for this node
            if types is None:
                response.error(
                    f"KnowledgeGraph node {node_id} does not have a type. This should never be",
                    error_code="NodeMissingType")
                return response
            matching_types = [
                node_type for node_type in types if node_type in node_type_map
            ]

            #### If we did not find exactly one matching type, error out
            if len(matching_types) == 0:
                response.error(
                    f"Tried to find types '{types}' for KnowledgeGraph node {node_id} in query_graph_info, but did not find it",
                    error_code="NodeTypeMissingInQueryGraph")
                return response
            if len(matching_types) > 1:
                response.error(
                    f"Tried to find types '{types}' for KnowledgeGraph node {node_id} in query_graph_info, and found multiple matches. This is ambiguous",
                    error_code="MultipleNodeTypesInQueryGraph")
                return response

            #### Else add it
            node.qnode_id = node_type_map[matching_types[0]]

        #### Loop through the edges adding qedge_ids
        for edge in knowledge_graph.edges:

            #### Edges that already carry a qedge_id are left alone
            if edge.qedge_id is not None:
                continue
            edge_id = edge.id

            #### If there isn't a type or can't find it in the query_graph, error out
            if edge.type is None:
                response.error(
                    f"KnowledgeGraph edge {edge_id} does not have a type. This should never be",
                    error_code="EdgeMissingType")
                return response
            if edge.type not in edge_type_map:
                response.error(
                    f"Tried to find type '{edge.type}' for KnowledgeGraph edge {edge_id} in query_graph_info, but did not find it",
                    error_code="EdgeTypeMissingInQueryGraph")
                return response

            #### Else add it
            edge.qedge_id = edge_type_map[edge.type]

        #### Return the response
        return response
Exemplo n.º 20
0
    def add_qnode(self, message, input_parameters, describe=False):
        """
        Adds a new QNode object to the QueryGraph inside the Message object
        :param message: Message object whose query_graph receives the new QNode (the query_graph and its
            node list are created if missing)
        :param input_parameters: Dict of input parameters (allowed keys: id, curie, name, type, is_set).
            The first of curie, name, type that is provided decides how the node is built.
        :type input_parameters: dict
        :param describe: If True, only return the dict describing the allowable parameters
        :type describe: bool
        :return: Response object with execution information (or the description dict if describe=True)
        :rtype: Response
        """

        # #### Internal documentation setup
        allowable_parameters = {
            'id': {
                'Any string that is unique among all QNode id fields, with recommended format n00, n01, n02, etc.'
            },
            'curie': {
                'Any compact URI (CURIE) (e.g. DOID:9281) (May also be a list like [UniProtKB:P12345,UniProtKB:Q54321])'
            },
            'name': {
                'Any name of a bioentity that will be resolved into a CURIE if possible or result in an error if not (e.g. hypertension, insulin)'
            },
            'type': {
                'Any valid Translator bioentity type (e.g. protein, chemical_substance, disease)'
            },
            'is_set': {
                'If set to true, this QNode represents a set of nodes that are all in common between the two other linked QNodes'
            },
        }
        if describe:
            allowable_parameters[
                'dsl_command'] = '`add_qnode()`'  # can't get this name at run-time, need to manually put it in per https://www.python.org/dev/peps/pep-3130/
            allowable_parameters[
                'brief_description'] = """The `add_qnode` method adds an additional QNode to the QueryGraph in the Message object. Currently
                when a curie or name is specified, this method will only return success if a matching node is found in the KG1/KG2 KGNodeIndex."""
            return allowable_parameters

        #### Define a default response
        response = Response()
        self.response = response
        self.message = message

        #### Basic checks on arguments
        if not isinstance(input_parameters, dict):
            response.error("Provided parameters is not a dict",
                           error_code="ParametersNotDict")
            return response

        #### Define a complete set of allowed parameters and their defaults
        parameters = {
            'id': None,
            'curie': None,
            'name': None,
            'type': None,
            'is_set': None,
        }

        #### Loop through the input_parameters and override the defaults and make sure they are allowed
        for key, value in input_parameters.items():
            if key not in parameters:
                response.error(f"Supplied parameter {key} is not permitted",
                               error_code="UnknownParameter")
            else:
                parameters[key] = value
        #### Return if any of the parameters generated an error (showing not just the first one)
        if response.status != 'OK':
            return response

        #### Store these final parameters for convenience
        response.data['parameters'] = parameters
        self.parameters = parameters

        response.info(
            f"Adding a QueryNode to Message with parameters {parameters}")

        #### Make sure there's a query_graph already here
        if message.query_graph is None:
            message.query_graph = QueryGraph()
            message.query_graph.nodes = []
            message.query_graph.edges = []
        if message.query_graph.nodes is None:
            message.query_graph.nodes = []

        #### Set up the KGNodeIndex
        kgNodeIndex = KGNodeIndex()

        # Create the QNode and set the id
        qnode = QNode()
        if parameters['id'] is not None:
            qnode.id = parameters['id']
        else:
            qnode.id = self.__get_next_free_node_id()

        # Set the is_set parameter to what the user selected. str() lets a real bool through
        # as well as the 'true'/'false' strings
        if parameters['is_set'] is not None:
            qnode.is_set = (str(parameters['is_set']).lower() == 'true')

        #### If the CURIE is specified, try to find that
        if parameters['curie'] is not None:

            # If the curie is a scalar then treat it here as a list of one
            if isinstance(parameters['curie'], str):
                curie_list = [parameters['curie']]
                is_curie_a_list = False
                if parameters['is_set'] is not None and qnode.is_set is True:
                    response.error(
                        f"Specified CURIE '{parameters['curie']}' is a scalar, but is_set=true, which doesn't make sense",
                        error_code="CurieScalarButIsSetTrue")
                    return response

            # Or else set it up as a list
            elif isinstance(parameters['curie'], list):
                curie_list = parameters['curie']
                is_curie_a_list = True
                qnode.curie = []
                if parameters['is_set'] is None:
                    response.warning(
                        f"Specified CURIE '{parameters['curie']}' is a list, but is_set was not set to true. It must be true in this context, so automatically setting to true. Avoid this warning by explictly setting to true."
                    )
                    qnode.is_set = True
                elif not qnode.is_set:
                    response.warning(
                        f"Specified CURIE '{parameters['curie']}' is a list, but is_set=false, which doesn't make sense, so automatically setting to true. Avoid this warning by explictly setting to true."
                    )
                    qnode.is_set = True

            # Or if it's neither a list or a string, then error out. This cannot be handled at present
            else:
                response.error(
                    f"Specified CURIE '{parameters['curie']}' is neither a string nor a list. This cannot to handled",
                    error_code="CurieNotListOrScalar")
                return response

            # Loop over the available curies and create the list
            for curie in curie_list:
                response.debug(f"Looking up CURIE {curie} in KgNodeIndex")
                nodes = kgNodeIndex.get_curies_and_types(curie, kg_name='KG2')

                # If nothing was found, we won't bail out, but rather just issue a warning
                if len(nodes) == 0:
                    response.warning(
                        f"A node with CURIE {curie} is not in our knowledge graph KG2, but will continue"
                    )
                    resolved_curie = curie
                else:
                    # FIXME. This is just always taking the first result. This could cause problems for CURIEs with multiple types. Is that possible?
                    # In issue #623 on 2020-06-15 we concluded that we should not specify the type here
                    resolved_curie = nodes[0]['curie']

                # Either append or set the resolved curie
                if is_curie_a_list:
                    qnode.curie.append(resolved_curie)
                else:
                    qnode.curie = resolved_curie

            # Apply the requested type once, outside the loop, so that it is also honored when the
            # CURIE list is empty
            if parameters['type'] is not None:
                if isinstance(parameters['type'], str):
                    qnode.type = parameters['type']
                else:
                    qnode.type = parameters['type'][0]

        #### If the name is specified, try to find that
        elif parameters['name'] is not None:
            response.debug(
                f"Looking up CURIE {parameters['name']} in KgNodeIndex")
            nodes = kgNodeIndex.get_curies_and_types(parameters['name'])
            if len(nodes) == 0:
                # Not found with the default lookup, so fall back to KG2
                nodes = kgNodeIndex.get_curies_and_types(parameters['name'],
                                                         kg_name='KG2')
                if len(nodes) == 0:
                    response.error(
                        f"A node with name '{parameters['name']}'' is not in our knowledge graph",
                        error_code="UnknownCURIE")
                    return response
            qnode.curie = nodes[0]['curie']
            qnode.type = nodes[0]['type']

        #### If the type is specified, just add that type. There should be checking that it is legal. FIXME
        elif parameters['type'] is not None:
            qnode.type = parameters['type']

        #### Otherwise all three main parameters are null. Just a generic node with no type or anything. This is okay.

        #### Every successful path ends by adding the node to the QueryGraph
        message.query_graph.nodes.append(qnode)
        return response
Exemplo n.º 21
0
    def assess(self, message):
        """Assess the QueryGraph of *message* and record basic structural facts.

        The method counts nodes and edges, gathers per-node and per-edge
        statistics, chooses a start node, walks the graph from that node to
        derive a linear node/edge order, and renders a text template of the
        resulting geometry.

        Results are stored on ``self``: ``response``, ``message``, ``n_nodes``,
        ``n_edges``, ``node_type_map``, ``edge_type_map``, ``node_info``,
        ``edge_info``, ``start_node``, ``node_order``, ``edge_order`` and
        ``query_graph_templates``. ``is_bifurcated_graph`` is only assigned
        (to True) when a node with more than two distinct links is found.

        Side effect: ``is_set`` is forced to True on every intermediate node
        of the derived node order.

        Args:
            message: object exposing ``query_graph.nodes`` and
                ``query_graph.edges``.

        Returns:
            The Response object carrying the log messages and, when the
            QueryGraph is not acceptable, the error that stopped the
            assessment.
        """

        #### Define a default response
        response = Response()
        self.response = response
        self.message = message
        response.debug("Assessing the QueryGraph for basic information")

        #### Get shorter handles
        query_graph = message.query_graph
        qnodes = query_graph.nodes
        qedges = query_graph.edges

        #### Store number of nodes and edges
        self.n_nodes = len(qnodes)
        self.n_edges = len(qedges)
        response.debug(f"Found {self.n_nodes} nodes and {self.n_edges} edges")

        #### Handle impossible cases
        if self.n_nodes == 0:
            response.error(
                "QueryGraph has 0 nodes. At least 1 node is required",
                error_code="QueryGraphZeroNodes")
            return response
        if self.n_nodes == 1 and self.n_edges > 0:
            response.error(
                "QueryGraph may not have edges if there is only one node",
                error_code="QueryGraphTooManyEdges")
            return response

        #### Loop through nodes computing some stats
        node_info = {}
        self.node_type_map = {}
        # Tracked across the whole loop so the notice is logged only once
        untyped_node_reported = False
        for qnode in qnodes:
            node_id = qnode.id
            # Reject id-less nodes before using the id as a dictionary key
            if node_id is None:
                response.error(
                    "QueryGraph has a node with no id. This is not permitted",
                    error_code="QueryGraphNodeWithNoId")
                return response

            node_info[node_id] = {
                'id': node_id,
                'node_object': qnode,
                'has_curie': qnode.curie is not None,
                'type': qnode.type,
                'has_type': qnode.type is not None,
                'is_set': False,
                'n_edges': 0,
                'n_links': 0,
                'is_connected': False,
                'edges': [],
                'edge_dict': {}
            }

            #### Store lookup of types
            if qnode.type is None:
                if not untyped_node_reported:
                    response.debug(
                        "QueryGraph has nodes with no type. This may cause problems with results inference later"
                    )
                    untyped_node_reported = True
                self.node_type_map['unknown'] = node_id
            else:
                self.node_type_map[qnode.type] = node_id

        #### Loop through edges computing some stats
        # Special informational edge types that are ignored for now
        virtual_edge_types = {
            'has_normalized_google_distance_with',
            'has_fisher_exact_test_p-value_with',
            'has_jaccard_index_with',
            'probably_treats',
            'has_paired_concept_frequency_with',
            'has_observed_expected_ratio_with',
            'has_chi_square_with',
        }
        edge_info = {}
        self.edge_type_map = {}
        unique_links = set()
        # Tracked across the whole loop so the notice is logged only once
        untyped_edge_reported = False
        for qedge in qedges:
            if qedge.type is not None and qedge.type in virtual_edge_types:
                continue

            edge_id = qedge.id
            # Reject id-less edges before using the id as a dictionary key
            if edge_id is None:
                response.error(
                    "QueryGraph has a edge with no id. This is not permitted",
                    error_code="QueryGraphEdgeWithNoId")
                return response

            # 'type' records the edge's own type (None when unspecified)
            edge_record = {
                'id': edge_id,
                'has_type': qedge.type is not None,
                'source_id': qedge.source_id,
                'target_id': qedge.target_id,
                'type': qedge.type
            }
            edge_info[edge_id] = edge_record

            #### Create a unique node link string
            # Sorting makes the key independent of the edge direction, so
            # parallel edges between the same two nodes count as one link
            link_string = ','.join(sorted([qedge.source_id, qedge.target_id]))
            source_info = node_info[qedge.source_id]
            target_info = node_info[qedge.target_id]
            if link_string not in unique_links:
                source_info['n_links'] += 1
                target_info['n_links'] += 1
                unique_links.add(link_string)

            source_info['n_edges'] += 1
            target_info['n_edges'] += 1
            source_info['is_connected'] = True
            target_info['is_connected'] = True
            source_info['edges'].append(edge_record)
            target_info['edges'].append(edge_record)
            source_info['edge_dict'][edge_id] = edge_record
            target_info['edge_dict'][edge_id] = edge_record

            #### Store lookup of types
            edge_type = 'any'
            if qedge.type is None:
                if not untyped_edge_reported:
                    response.debug(
                        "QueryGraph has edges with no type. This may cause problems with results inference later"
                    )
                    untyped_edge_reported = True
            else:
                edge_type = qedge.type

            #### It's not clear yet whether we need to store the whole sentence or just the type
            self.edge_type_map[edge_type] = edge_id

        #### Loop through the nodes again, collecting the nodes with fewer than two links
        singletons = []
        for node_data in node_info.values():
            if node_data['n_links'] < 2:
                singletons.append(node_data)
            elif node_data['n_links'] > 2:
                self.is_bifurcated_graph = True
                response.warning(
                    "QueryGraph appears to have a fork in it. This might cause trouble"
                )

        #### Try to identify the start_node
        if singletons:
            start_node = singletons[0]
        else:
            # Every node has at least two links (e.g. a cycle): fall back to
            # the first node so the geometry warning below can be reported
            # instead of failing with an IndexError
            start_node = next(iter(node_info.values()))

        if self.n_nodes == 1:
            # Just a single node, fine
            pass
        elif len(singletons) < 2:
            response.warning(
                "QueryGraph appears to be circular or has a strange geometry. This might cause trouble"
            )
        elif len(singletons) > 2:
            response.warning(
                "QueryGraph appears to have a fork in it. This might cause trouble"
            )
        elif not singletons[0]['has_curie'] and singletons[1]['has_curie']:
            # Exactly two end nodes: prefer the one that carries a curie
            start_node = singletons[1]
        #### Hmm, that's not very robust against odd graphs. This needs work. FIXME

        self.node_info = node_info
        self.edge_info = edge_info
        self.start_node = start_node

        #### Walk the graph from the start node, following one edge at a time
        current_node = start_node
        node_order = [start_node]
        edge_order = []
        pending_edges = current_node['edges']
        while pending_edges:
            if len(pending_edges) > 1:
                response.error(
                    "Help, two edges at A583. Don't know what to do",
                    error_code="InteralErrorA583")
                return response
            edge = pending_edges[0]
            edge_order.append(edge)
            previous_node = current_node
            if edge['source_id'] == current_node['id']:
                current_node = node_info[edge['target_id']]
            elif edge['target_id'] == current_node['id']:
                current_node = node_info[edge['source_id']]
            else:
                response.error("Help, edge error A584. Don't know what to do",
                               error_code="InteralErrorA584")
                return response
            node_order.append(current_node)

            # Continue only along edges that do not also touch the previous node
            pending_edges = [
                candidate for candidate in current_node['edges']
                if candidate['id'] not in previous_node['edge_dict']
            ]

        self.node_order = node_order
        self.edge_order = edge_order

        # Create a text rendering of the QueryGraph geometry for matching against a template
        templates = {
            'simple': '',
            'detailed': {
                'n_nodes': len(node_order),
                'components': []
            }
        }
        self.query_graph_templates = templates
        components = templates['detailed']['components']
        edge_index = 0
        for node_index, node in enumerate(node_order):
            component_id = f"n{node_index:02}"
            component = {
                'component_type': 'node',
                'component_id': component_id,
                'has_curie': node['has_curie'],
                'has_type': node['has_type'],
                'type_value': None
            }
            components.append(component)

            content = ''
            if node['has_curie']:
                content = 'curie'
            if node['has_type'] and node['node_object'].type is not None:
                content = f"type={node['node_object'].type}"
                component['type_value'] = node['node_object'].type
            elif node['has_type']:
                content = 'type'
            templates['simple'] += f"{component_id}({content})"

            # Since queries with intermediate nodes that are not is_set=true tend to blow up, for now, make them is_set=true unless explicitly set to false
            # NOTE: node_info initialises 'is_set' to False for every node, so
            # with the records built above the last branch is the one taken
            if 0 < node_index < (self.n_nodes - 1):
                if 'is_set' not in node or node['is_set'] is None:
                    node['node_object'].is_set = True
                    response.warning(
                        f"Setting unspecified is_set to true for {node['id']} because this will probably lead to a happier result"
                    )
                elif node['is_set'] is True:
                    response.debug(
                        f"Value for is_set is already true for {node['id']} so that's good"
                    )
                elif node['is_set'] is False:
                    response.info(
                        f"Value for is_set is false for intermediate node {node['id']}. Setting to true because this will probably lead to a happier result"
                    )
                    node['node_object'].is_set = True

            # Every node except the last expected one is followed by an edge placeholder
            if node_index + 1 < self.n_nodes:
                edge_component_id = f"e{edge_index:02}"
                templates['simple'] += f"-{edge_component_id}()-"
                components.append({
                    'component_type': 'edge',
                    'component_id': edge_component_id,
                    'has_curie': False,
                    'has_type': False
                })
                edge_index += 1

        response.debug(
            f"The QueryGraph reference template is: {self.query_graph_templates['simple']}"
        )

        #### Return the response
        return response
Exemplo n.º 22
0
    def apply(self, input_message, input_parameters, response=None):
        """Expand the knowledge graph of *input_message* as requested.

        The supplied parameters are validated against ``self.parameters``
        (after a set of defaults is written into it), the selected query
        edges are expanded one at a time, then the selected query nodes, and
        the merged result is stored back on the message.

        Args:
            input_message: message whose ``query_graph`` drives the expansion
                and whose ``knowledge_graph`` is read and replaced.
            input_parameters: dict of parameter names to values; the strings
                "true"/"false" (any case) are converted to booleans.
            response: optional Response to log into; a new one is created
                when omitted.

        Returns:
            The response object, returned early whenever its status is no
            longer 'OK'.
        """
        response = Response() if response is None else response
        self.response = response
        self.message = input_message

        # Basic checks on arguments
        if not isinstance(input_parameters, dict):
            response.error("Provided parameters is not a dict",
                           error_code="ParametersNotDict")
            return response

        # Define a complete set of allowed parameters and their defaults
        parameters = self.parameters
        parameters.update({
            'kp': "ARAX/KG1",
            'enforce_directionality': False,
            'use_synonyms': True,
            'synonym_handling': 'map_back',
            'continue_if_no_results': False,
        })
        for name, supplied in input_parameters.items():
            if name and name not in parameters:
                response.error(f"Supplied parameter {name} is not permitted",
                               error_code="UnknownParameter")
                continue
            # Accept textual booleans regardless of letter case
            if type(supplied) is str:
                folded = supplied.lower()
                if folded == "true":
                    supplied = True
                elif folded == "false":
                    supplied = False
            parameters[name] = supplied

        # Default to expanding the entire query graph if the user didn't specify what to expand
        if not (parameters['edge_id'] or parameters['node_id']):
            whole_query_graph = self.message.query_graph
            parameters['edge_id'] = [
                qedge.id for qedge in whole_query_graph.edges
            ]
            parameters['node_id'] = self._get_orphan_query_node_ids(
                whole_query_graph)

        if response.status != 'OK':
            return response

        response.data['parameters'] = parameters
        self.parameters = parameters

        # Do the actual expansion
        response.debug(
            f"Applying Expand to Message with parameters {parameters}")
        edge_ids_to_expand = eu.convert_string_or_list_to_list(
            parameters['edge_id'])
        node_ids_to_expand = eu.convert_string_or_list_to_list(
            parameters['node_id'])
        kp_to_use = parameters['kp']
        continue_if_no_results = parameters['continue_if_no_results']

        # Convert message knowledge graph to dictionary format, for faster processing
        dict_kg = eu.convert_standard_kg_to_dict_kg(
            self.message.knowledge_graph)

        # Expand any specified edges
        if edge_ids_to_expand:
            query_sub_graph = self._extract_query_subgraph(
                edge_ids_to_expand, self.message.query_graph)
            if response.status != 'OK':
                return response
            response.debug(
                f"Query graph for this Expand() call is: {query_sub_graph.to_dict()}"
            )

            # Expand the query graph edge by edge (much faster for neo4j queries, and allows easy integration with BTE)
            node_usages_by_edges_map = dict()
            for qedge in self._get_order_to_expand_edges_in(query_sub_graph):
                answer_kg, edge_node_usage_map = self._expand_edge(
                    qedge, kp_to_use, dict_kg, continue_if_no_results,
                    self.message.query_graph)
                if response.status != 'OK':
                    return response
                node_usages_by_edges_map[qedge.id] = edge_node_usage_map

                self._process_and_merge_answer(answer_kg, dict_kg)
                if response.status != 'OK':
                    return response

                self._prune_dead_end_paths(dict_kg, query_sub_graph,
                                           node_usages_by_edges_map)
                if response.status != 'OK':
                    return response

        # Expand any specified nodes
        for qnode_id in node_ids_to_expand or []:
            answer_kg = self._expand_node(qnode_id, kp_to_use,
                                          continue_if_no_results,
                                          self.message.query_graph)
            if response.status != 'OK':
                return response

            self._process_and_merge_answer(answer_kg, dict_kg)
            if response.status != 'OK':
                return response

        # Convert message knowledge graph back to API standard format
        final_kg = eu.convert_dict_kg_to_standard_kg(dict_kg)
        self.message.knowledge_graph = final_kg

        # Return the response and done
        response.info(
            f"After Expand, Message.KnowledgeGraph has {len(final_kg.nodes)} nodes and {len(final_kg.edges)} edges"
        )
        return response
Exemplo n.º 23
0
    def _add_answers_to_kg(self, answer_kg: DictKnowledgeGraph,
                           reasoner_std_response: Dict[str, any],
                           input_qnode_id: str, output_qnode_id: str,
                           qedge_id: str, log: Response) -> DictKnowledgeGraph:
        """Copy the nodes and edges of a BTE response into *answer_kg*.

        Each returned node is tagged with ``input_qnode_id`` or
        ``output_qnode_id`` depending on whether BTE bound it to "n0" or
        "n1"; each returned edge (which must be bound to "e1") is tagged with
        ``qedge_id``. Output nodes get their id replaced by the curie chosen
        by ``_get_best_equivalent_bte_curie``, and edge endpoints are
        rewritten to match.

        Args:
            answer_kg: knowledge graph to add the nodes and edges to.
            reasoner_std_response: dict with a 'results' entry and a
                'knowledge_graph' entry holding 'nodes' and 'edges'.
            input_qnode_id: query node id to use for nodes bound to "n0".
            output_qnode_id: query node id to use for nodes bound to "n1".
            qedge_id: query edge id to use for the returned edges.
            log: Response used for debug and error messages.

        Returns:
            ``answer_kg``; it is returned as filled so far when a binding
            cannot be mapped (an error is logged), and untouched when the
            response contains no edges.
        """
        qg_ids_by_kg_id = self._build_kg_to_qg_id_dict(
            reasoner_std_response['results'])
        returned_kg = reasoner_std_response['knowledge_graph']
        returned_edges = returned_kg['edges']
        if not returned_edges:
            return answer_kg

        log.debug(
            f"Got results back from BTE for this query "
            f"({len(returned_edges)} edges)"
        )
        # BTE node id -> preferred curie chosen for that node
        preferred_ids = dict()

        for bte_node in returned_kg['nodes']:
            new_node = Node()
            bte_node_id = bte_node.get('id')
            new_node.name = bte_node.get('name')
            snake_case_type = eu.convert_string_to_snake_case(
                bte_node.get('type'))
            new_node.type = eu.convert_string_or_list_to_list(snake_case_type)

            # Map the returned BTE qg_ids back to the original qnode_ids in our query graph
            bte_qg_id = qg_ids_by_kg_id['nodes'].get(bte_node_id)
            if bte_qg_id == "n0":
                qnode_id = input_qnode_id
            elif bte_qg_id == "n1":
                qnode_id = output_qnode_id
            else:
                log.error("Could not map BTE qg_id to ARAX qnode_id",
                          error_code="UnknownQGID")
                return answer_kg

            # Find and use the preferred equivalent identifier for this node (if it's an output node)
            if qnode_id != output_qnode_id:
                new_node.id = bte_node_id
            elif bte_node_id in preferred_ids:
                new_node.id = preferred_ids.get(bte_node_id)
            else:
                candidate_curies = []
                identifiers_by_prefix = bte_node.get('equivalent_identifiers')
                for prefix, local_ids in identifiers_by_prefix.items():
                    for local_id in local_ids:
                        candidate_curies.append(
                            f"{prefix}:{eu.get_curie_local_id(local_id)}")
                new_node.id = self._get_best_equivalent_bte_curie(
                    candidate_curies, new_node.type[0])
                preferred_ids[bte_node_id] = new_node.id

            answer_kg.add_node(new_node, qnode_id)

        for bte_edge in returned_edges:
            new_edge = Edge()
            new_edge.id = bte_edge.get("id")
            new_edge.type = bte_edge.get('type')
            # Endpoints follow any node id remapping done above
            bte_source_id = bte_edge.get('source_id')
            new_edge.source_id = preferred_ids.get(bte_source_id,
                                                   bte_source_id)
            bte_target_id = bte_edge.get('target_id')
            new_edge.target_id = preferred_ids.get(bte_target_id,
                                                   bte_target_id)
            new_edge.is_defined_by = "BTE"
            new_edge.provided_by = bte_edge.get('edge_source')

            # Map the returned BTE qg_id back to the original qedge_id in our query graph
            if qg_ids_by_kg_id['edges'].get(new_edge.id) != "e1":
                log.error("Could not map BTE qg_id to ARAX qedge_id",
                          error_code="UnknownQGID")
                return answer_kg
            answer_kg.add_edge(new_edge, qedge_id)

        return answer_kg