def _create_icees_virtual_edge(self, subject_curie, object_curie, p_value):
    """
    Build the virtual ICEES edge joining two curies.

    The edge carries the attribute produced by
    ``self._create_icees_edge_attribute(p_value)`` followed by the
    "is_defined_by" and "provided_by" bookkeeping attributes, and is tagged
    with ``self.virtual_relation_label`` both as its relation and as its
    only qedge key.

    :param subject_curie: curie used as the subject of the edge
    :param object_curie: curie used as the object of the edge
    :param p_value: value handed to ``_create_icees_edge_attribute``
    :return: tuple ``(edge_key, edge)`` where ``edge_key`` has the form
             ``"ICEES:<subject_curie>--<object_curie>"``
    """
    edge_key = "ICEES:{}--{}".format(subject_curie, object_curie)

    # The computed ICEES attribute comes first, then the provenance entries.
    attributes = [self._create_icees_edge_attribute(p_value)]
    attributes.append(
        EdgeAttribute(name="is_defined_by",
                      value="ARAX",
                      type="ARAX_TYPE_PLACEHOLDER"))
    attributes.append(
        EdgeAttribute(name="provided_by",
                      value="ICEES+",
                      type="biolink:provided_by"))

    virtual_edge = Edge(predicate=self.icees_edge_type,
                        subject=subject_curie,
                        object=object_curie,
                        relation=self.virtual_relation_label,
                        attributes=attributes)
    # qedge_keys is set after construction rather than through the constructor.
    virtual_edge.qedge_keys = [self.virtual_relation_label]
    return edge_key, virtual_edge
# Example #2
0
 def _create_icees_virtual_edge(self, subject_curie, object_curie, p_value):
     """
     Build the virtual ICEES edge joining two curies.

     The edge carries, in order: the attribute produced by
     ``self._create_icees_edge_attribute(p_value)``, the virtual relation
     label, the "infores:icees" knowledge-source attribute, and a boolean
     attribute marking the edge as a container for a computed value.

     :param subject_curie: curie used as the subject of the edge
     :param object_curie: curie used as the object of the edge
     :param p_value: value handed to ``_create_icees_edge_attribute``
     :return: tuple ``(edge_key, edge)`` where ``edge_key`` has the form
              ``"ICEES:<subject_curie>--<object_curie>"``
     """
     edge_key = "ICEES:{}--{}".format(subject_curie, object_curie)
     icees_source = "infores:icees"

     p_value_attribute = self._create_icees_edge_attribute(p_value)
     label_attribute = EdgeAttribute(
         original_attribute_name="virtual_relation_label",
         value=self.virtual_relation_label,
         attribute_type_id="biolink:Unknown")
     source_attribute = EdgeAttribute(
         original_attribute_name="provided_by",
         value=icees_source,
         attribute_type_id="biolink:aggregator_knowledge_source",
         attribute_source=icees_source,
         value_type_id="biolink:InformationResource")
     container_attribute = EdgeAttribute(
         original_attribute_name=None,
         value=True,
         attribute_type_id="biolink:computed_value",
         attribute_source="infores:arax-reasoner-ara",
         value_type_id="metatype:Boolean",
         value_url=None,
         description="This edge is a container for a computed value between two nodes that is not directly attachable to other edges.")

     virtual_edge = Edge(predicate=self.icees_edge_type,
                         subject=subject_curie,
                         object=object_curie,
                         attributes=[
                             p_value_attribute,
                             label_attribute,
                             source_attribute,
                             container_attribute,
                         ])
     # qedge_keys is set after construction rather than through the constructor.
     virtual_edge.qedge_keys = [self.virtual_relation_label]
     return edge_key, virtual_edge
    def predict_drug_treats_disease(self):
        """
        Add the drug-disease treatment probability to the knowledge graph.

        Two modes, selected by ``self.parameters``:

        * ``'virtual_relation_label'`` present: for every (drug, disease) pair
          formed from the KG nodes bound to ``subject_qnode_key`` and
          ``object_qnode_key``, a virtual "biolink:probably_treats" edge
          carrying the probability is added to the KG, and one matching query
          edge is added to the query graph if at least one KG edge was added.
        * otherwise: every existing KG edge that joins a drug node and a
          disease node (in either direction) gets a "probability_treats"
          attribute appended.

        A probability of 0 (also used when the model returns nothing or a
        non-finite number) means "no prediction": no edge or attribute is added.
        The probability is stored as a string.

        :return: response
        """
        parameters = self.parameters
        self.response.debug("Computing drug disease treatment probability based on a machine learning model")
        self.response.info("Computing drug disease treatment probability based on a machine learning model: See [this publication](https://doi.org/10.1101/765305) for more details about how this is accomplished.")

        attribute_name = "probability_treats"
        attribute_type = "EDAM:data_0951"
        url = "https://doi.org/10.1101/765305"

        # Old-style and biolink-prefixed labels are both accepted. Tuples (not
        # sets) so membership tests also work against unhashable values.
        drug_types = ("drug", "chemical_substance", "biolink:Drug", "biolink:ChemicalSubstance")
        disease_types = ("disease", "phenotypic_feature", "biolink:Disease", "biolink:PhenotypicFeature")

        # Sentinel for "this curie cannot be used", kept distinct from any
        # value the converter itself might return as the preferred curie.
        unusable = object()

        def has_category(categories, wanted):
            """True if any of the wanted labels is contained in categories."""
            return any(label in categories for label in wanted)

        def trained_curie(curie, allowed_types):
            """Preferred curie for `curie` if its preferred type is allowed, else `unusable`."""
            converted = self.convert_to_trained_curies(curie)
            if converted is None or converted['preferred_type'] not in allowed_types:
                return unusable
            return converted['preferred_curie']

        def lookup_probability(drug_curie, disease_curie):
            """Finite probability for the pair from the DB or the model, else 0."""
            if self.use_prob_db is True:
                probability = self.pred.get_prob_from_DTD_db(drug_curie, disease_curie)
                if probability is not None and np.isfinite(probability):
                    return probability
            else:
                probability = self.pred.prob_single(drug_curie, disease_curie)
                if probability is not None:
                    # prob_single's result is indexed; only its first element is used
                    probability = probability[0]
                    if np.isfinite(probability):
                        return probability
            return 0

        # if you want to add virtual edges, identify the source/targets, decorate the edges, add them to the KG, and then add one to the QG corresponding to them
        if 'virtual_relation_label' in parameters:
            source_curies_to_decorate = set()
            target_curies_to_decorate = set()
            curie_to_name = dict()
            # identify the nodes that we should be adding virtual edges for
            for node_key, node in self.message.knowledge_graph.nodes.items():
                if not hasattr(node, 'qnode_keys'):
                    continue
                # the category is checked here because ARAX_overlay no longer checks it
                if parameters['subject_qnode_key'] in node.qnode_keys and has_category(node.category, drug_types):
                    source_curies_to_decorate.add(node_key)
                    curie_to_name[node_key] = node.name
                if parameters['object_qnode_key'] in node.qnode_keys and has_category(node.category, disease_types):
                    target_curies_to_decorate.add(node_key)
                    curie_to_name[node_key] = node.name

            edge_type = "biolink:probably_treats"
            relation = parameters['virtual_relation_label']
            added_flag = False  # becomes True once at least one virtual edge is added

            # iterate over all pairs of these nodes, add the virtual edge, decorate with the correct attribute
            for source_curie, target_curie in itertools.product(source_curies_to_decorate, target_curies_to_decorate):
                self.response.debug(f"Predicting probability that {curie_to_name[source_curie]} treats {curie_to_name[target_curie]}")
                converted_source_curie = trained_curie(source_curie, drug_types)
                if converted_source_curie is unusable:
                    continue
                converted_target_curie = trained_curie(target_curie, disease_types)
                if converted_target_curie is unusable:
                    continue
                value = lookup_probability(converted_source_curie, converted_target_curie)
                if value == 0:
                    continue  # no prediction for this pair: add nothing

                probability_attribute = EdgeAttribute(type=attribute_type, name=attribute_name, value=str(value), url=url)
                added_flag = True
                defined_datetime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

                # edge keys are numbered with a counter kept on the instance
                edge_key = f"{relation}_{self.global_iter}"
                self.global_iter += 1
                edge_attribute_list = [
                    probability_attribute,
                    EdgeAttribute(name="is_defined_by", value="ARAX", type="ARAX_TYPE_PLACEHOLDER"),
                    EdgeAttribute(name="defined_datetime", value=defined_datetime, type="metatype:Datetime"),
                    EdgeAttribute(name="provided_by", value="ARAX", type="biolink:provided_by"),
                ]
                # the KG edge keeps the original (unconverted) curies as its endpoints
                edge = Edge(predicate=edge_type, subject=source_curie, object=target_curie, relation=relation,
                            attributes=edge_attribute_list)
                edge.qedge_keys = [relation]
                self.message.knowledge_graph.edges[edge_key] = edge

            # Now add a q_edge the query_graph since I've added an extra edge to the KG
            if added_flag:
                subject_qnode_key = parameters['subject_qnode_key']
                object_qnode_key = parameters['object_qnode_key']
                option_group_id = ou.determine_virtual_qedge_option_group(subject_qnode_key, object_qnode_key, self.message.query_graph, self.response)
                q_edge = QEdge(predicate=edge_type, relation=relation, subject=subject_qnode_key, object=object_qnode_key, option_group_id=option_group_id)
                self.message.query_graph.edges[relation] = q_edge
            return self.response

        # no virtual relation requested: decorate the existing KG edges instead
        try:
            # map curies to types and names
            curie_to_type = dict()
            curie_to_name = dict()
            for node_key, node in self.message.knowledge_graph.nodes.items():
                curie_to_type[node_key] = node.category
                curie_to_name[node_key] = node.name

            # then iterate over the edges and decorate if appropriate
            for edge in self.message.knowledge_graph.edges.values():
                # make sure the attribute list exists before anything may be appended
                if not edge.attributes:
                    edge.attributes = []
                source_curie = edge.subject
                target_curie = edge.object
                source_types = curie_to_type[source_curie]
                target_types = curie_to_type[target_curie]

                if has_category(source_types, drug_types) and has_category(target_types, disease_types):
                    drug_is_subject = True
                elif has_category(target_types, drug_types) and has_category(source_types, disease_types):
                    drug_is_subject = False
                else:
                    continue

                self.response.debug(f"Predicting treatment probability between {curie_to_name[source_curie]} and {curie_to_name[target_curie]}")
                # The subject is always converted before the object; only the
                # set of acceptable preferred types depends on the direction.
                converted_source_curie = trained_curie(source_curie, drug_types if drug_is_subject else disease_types)
                if converted_source_curie is unusable:
                    continue
                converted_target_curie = trained_curie(target_curie, disease_types if drug_is_subject else drug_types)
                if converted_target_curie is unusable:
                    continue

                # the lookup always takes the drug first and the disease second
                if drug_is_subject:
                    value = lookup_probability(converted_source_curie, converted_target_curie)
                else:
                    value = lookup_probability(converted_target_curie, converted_source_curie)

                if value != 0:
                    edge.attributes.append(EdgeAttribute(type=attribute_type, name=attribute_name, value=str(value), url=url))
        except Exception as err:
            # Report the failure through the response instead of raising;
            # KeyboardInterrupt/SystemExit are deliberately not intercepted.
            tb = traceback.format_exc()
            self.response.error(tb, error_code=type(err).__name__)
            self.response.error("Something went wrong adding the drug disease treatment probability")
        else:
            self.response.info("Drug disease treatment probability successfully added to edges")

        return self.response
# Example #4
0
    def compute_jaccard(self):
        """
        Add virtual edges carrying a Jaccard index between the start node and each end node.

        For every KG node bound to ``end_node_key`` the value is
        (number of distinct intermediate nodes sharing a KG edge with that end
        node, in either direction) / (total number of intermediate nodes).
        One "biolink:has_jaccard_index_with" KG edge is added per end node and
        a matching query edge keyed by ``virtual_relation_label`` is added to
        the query graph.

        Failures are reported through ``self.response.error`` rather than
        raised.

        :return: response (also on failure)
        """
        message = self.message
        parameters = self.parameters
        self.response.debug(
            "Computing Jaccard distance and adding this information as virtual edges"
        )
        self.response.info(
            "Computing Jaccard distance and adding this information as virtual edges"
        )

        self.response.info("Getting all relevant nodes")
        # TODO: should I check that they're connected to the start node, or just assume that they are?
        # TODO: For now, assume that they are
        try:
            intermediate_nodes = set()
            # keys are end node curies, values are the sets of intermediate curies linked to them
            end_node_to_intermediate_node_set = dict()
            for key, node in message.knowledge_graph.nodes.items():
                if parameters['intermediate_node_key'] in node.qnode_keys:
                    intermediate_nodes.add(key)
                # the last node bound to the start qnode wins
                if parameters['start_node_key'] in node.qnode_keys:
                    subject_node_key = key
                if parameters['end_node_key'] in node.qnode_keys:
                    end_node_to_intermediate_node_set[key] = set()

            # Edge direction is ignored. Only node ids are collected, so several
            # predicates between the same pair of nodes count once.
            for edge in message.knowledge_graph.edges.values():
                if edge.subject in intermediate_nodes:
                    if edge.object in end_node_to_intermediate_node_set:
                        end_node_to_intermediate_node_set[edge.object].add(edge.subject)
                elif edge.object in intermediate_nodes:
                    if edge.subject in end_node_to_intermediate_node_set:
                        end_node_to_intermediate_node_set[edge.subject].add(edge.object)

            # now compute the actual jaccard indexes
            # NOTE(review): with no intermediate nodes but at least one end node
            # this raises ZeroDivisionError, which is reported by the handler below.
            denom = len(intermediate_nodes)
            end_node_to_jaccard = {
                end_node_key: len(linked_nodes) / float(denom)
                for end_node_key, linked_nodes in end_node_to_intermediate_node_set.items()
            }

            # edge properties
            edge_type = 'biolink:has_jaccard_index_with'
            relation = parameters['virtual_relation_label']
            qedge_keys = [relation]
            defined_datetime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            provided_by = "infores:arax"
            try:
                subject_key = subject_node_key
            except NameError:
                # No KG node was bound to the start qnode, so the name was never
                # assigned. Only warn here; building an edge below still fails
                # and is reported by the outer handler.
                self.response.warning(
                    f"subject node id: {parameters['start_node_key']} not found in the KG. Perhaps the KG is empty?"
                )

            # edge attribute properties
            attribute_type = 'EDAM:data_1772'
            name = "jaccard_index"
            url = None

            # now actually add the virtual edges in
            for j_iter, (end_node_key, value) in enumerate(end_node_to_jaccard.items()):
                edge_attribute = EdgeAttribute(
                    attribute_type_id=attribute_type,
                    original_attribute_name=name,
                    value=value,
                    value_url=url)
                # try to ensure a unique edge key: on a clash, append a random
                # 9-digit suffix until the key is free
                edge_key = f"J{j_iter}"
                while edge_key in message.knowledge_graph.edges:
                    edge_key = f"J{j_iter}.{random.randint(10**(9-1), (10**9)-1)}"
                edge_attribute_list = [
                    edge_attribute,
                    EdgeAttribute(
                        original_attribute_name="virtual_relation_label",
                        value=relation,
                        attribute_type_id="biolink:Unknown"),
                    EdgeAttribute(original_attribute_name="defined_datetime",
                                  value=defined_datetime,
                                  attribute_type_id="metatype:Datetime"),
                    EdgeAttribute(
                        original_attribute_name="provided_by",
                        value=provided_by,
                        attribute_type_id="biolink:aggregator_knowledge_source",
                        attribute_source=provided_by,
                        value_type_id="biolink:InformationResource"),
                    EdgeAttribute(
                        original_attribute_name=None,
                        value=True,
                        attribute_type_id="biolink:computed_value",
                        attribute_source="infores:arax-reasoner-ara",
                        value_type_id="metatype:Boolean",
                        value_url=None,
                        description=
                        "This edge is a container for a computed value between two nodes that is not directly attachable to other edges."
                    ),
                ]
                edge = Edge(predicate=edge_type,
                            subject=subject_key,
                            object=end_node_key,
                            attributes=edge_attribute_list)
                edge.qedge_keys = qedge_keys
                message.knowledge_graph.edges[edge_key] = edge

            # Now add a q_edge the query_graph since I've added an extra edge to the KG
            subject_qnode_key = parameters['start_node_key']
            object_qnode_key = parameters['end_node_key']
            option_group_id = ou.determine_virtual_qedge_option_group(
                subject_qnode_key, object_qnode_key, self.message.query_graph,
                self.response)
            # TODO: original author's note - option_group_id may not be accepted
            # by the TRAPI 1.0 QEdge; needs fixing once it can be tested
            q_edge = QEdge(predicates=[edge_type],
                           subject=subject_qnode_key,
                           object=object_qnode_key,
                           option_group_id=option_group_id)
            q_edge.relation = relation
            self.message.query_graph.edges[relation] = q_edge
        except Exception as err:
            tb = traceback.format_exc()
            self.response.error(
                "Something went wrong when computing the Jaccard index")
            self.response.error(tb, error_code=type(err).__name__)

        # Returned on every path so callers always get the response object back.
        return self.response
# Example #5
0
    def fisher_exact_test(self):
        """
        Peform the fisher's exact test to expand or decorate the knowledge graph
        :return: response
        """

        self.response.info(f"Performing Fisher's Exact Test to add p-value to edge attribute of virtual edge")

        # check the input parameters
        if 'subject_qnode_key' not in self.parameters:
            self.response.error(f"The argument 'subject_qnode_key' is required for fisher_exact_test function")
            return self.response
        else:
            subject_qnode_key = self.parameters['subject_qnode_key']
        if 'virtual_relation_label' not in self.parameters:
            self.response.error(f"The argument 'virtual_relation_label' is required for fisher_exact_test function")
            return self.response
        else:
            virtual_relation_label = str(self.parameters['virtual_relation_label'])
        if 'object_qnode_key' not in self.parameters:
            self.response.error(f"The argument 'object_qnode_key' is required for fisher_exact_test function")
            return self.response
        else:
            object_qnode_key = self.parameters['object_qnode_key']
        rel_edge_key = self.parameters['rel_edge_key'] if 'rel_edge_key' in self.parameters else None
        top_n = int(self.parameters['top_n']) if 'top_n' in self.parameters else None
        cutoff = float(self.parameters['cutoff']) if 'cutoff' in self.parameters else None

        # initialize some variables
        nodes_info = {}
        edge_expand_kp = []
        subject_node_list = []
        object_node_dict = {}
        size_of_object = {}
        subject_node_exist = False
        object_node_exist = False
        query_edge_key = set()
        rel_edge_type = set()
        subject_node_category = None
        object_node_category= None

        ## Check if subject_qnode_key and object_qnode_key are in the Query Graph
        try:
            if len(self.message.query_graph.nodes) != 0:
                for node_key in self.message.query_graph.nodes:
                    if node_key == subject_qnode_key:
                        subject_node_exist = True
                        subject_node_category = self.message.query_graph.nodes[node_key].category
                    elif node_key == object_qnode_key:
                        object_node_exist = True
                        object_node_category = self.message.query_graph.nodes[node_key].category
                    else:
                        pass
            else:
                self.response.error(f"There is no query node in QG")
                return self.response
        except:
            tb = traceback.format_exc()
            error_type, error, _ = sys.exc_info()
            self.response.error(tb, error_code=error_type.__name__)
            self.response.error(f"Something went wrong with retrieving nodes in message QG")
            return self.response

        if subject_node_exist:
            if object_node_exist:
                pass
            else:
                self.response.error(f"No query node with object qnode key {object_qnode_key} detected in QG for Fisher's Exact Test")
                return self.response
        else:
            self.response.error(f"No query node with subject qnode key {subject_qnode_key} detected in QG for Fisher's Exact Test")
            return self.response

        ## Check if there is a query edge connected to both subject_qnode_key and object_qnode_key in the Query Graph
        try:
            if len(self.message.query_graph.edges) != 0:
                for edge_key in self.message.query_graph.edges:
                    if self.message.query_graph.edges[edge_key].subject == subject_qnode_key and self.message.query_graph.edges[edge_key].object == object_qnode_key and self.message.query_graph.edges[edge_key].relation == None:
                        query_edge_key.update([edge_key])  # only actual query edge is added
                    elif self.message.query_graph.edges[edge_key].subject == object_qnode_key and self.message.query_graph.edges[edge_key].object == subject_qnode_key and self.message.query_graph.edges[edge_key].relation == None:
                        query_edge_key.update([edge_key])  # only actual query edge is added
                    else:
                        continue
            else:
                self.response.error(f"There is no query edge in Query Graph")
                return self.response
        except:
            tb = traceback.format_exc()
            error_type, error, _ = sys.exc_info()
            self.response.error(tb, error_code=error_type.__name__)
            self.response.error(f"Something went wrong with retrieving edges in message QG")
            return self.response

        if len(query_edge_key)!=0:
            if rel_edge_key:
                if rel_edge_key in query_edge_key:
                    pass
                else:
                    self.response.error(f"No query edge with qedge key {rel_edge_key} connected to both subject node with qnode key {subject_qnode_key} and object node with qnode key {object_qnode_key} detected in QG for Fisher's Exact Test")
                    return self.response
            else:
                pass
        else:
            self.response.error(
                f"No query edge connected to both subject node with qnode key {subject_qnode_key} and object node with qnode key {object_qnode_key} detected in QG for Fisher's Exact Test")
            return self.response

        ## loop over all nodes in KG and collect their node information
        try:
            count = 0
            for node_key, node in self.message.knowledge_graph.nodes.items():
                nodes_info[node_key] = {'count': count, 'qnode_keys': node.qnode_keys, 'category': self.message.knowledge_graph.nodes[node_key].category, 'edge_index': []}
                count = count + 1
        except:
            tb = traceback.format_exc()
            error_type, error, _ = sys.exc_info()
            self.response.error(tb, error_code=error_type.__name__)
            self.response.error(f"Something went wrong with retrieving nodes in message KG")
            return self.response

        ## loop over all edges in KG and create subject node list and target node dict based on subject_qnode_key, object_qnode_key as well as rel_edge_id (optional, otherwise all edges are considered)
        try:
            count = 0
            for edge_key, edge in self.message.knowledge_graph.edges.items():

                edge_attribute_dict = {x.name:x.value for x in self.message.knowledge_graph.edges[edge_key].attributes}
                if edge_attribute_dict['is_defined_by'] != 'ARAX':

                    nodes_info[self.message.knowledge_graph.edges[edge_key].subject]['edge_index'].append(count)
                    nodes_info[self.message.knowledge_graph.edges[edge_key].object]['edge_index'].append(count)

                    if rel_edge_key:
                        if rel_edge_key in edge.qedge_keys:
                            if subject_qnode_key in nodes_info[self.message.knowledge_graph.edges[edge_key].subject]['qnode_keys']:
                                edge_expand_kp.append(edge_attribute_dict['is_defined_by'])
                                rel_edge_type.update([self.message.knowledge_graph.edges[edge_key].predicate])
                                subject_node_list.append(self.message.knowledge_graph.edges[edge_key].subject)
                                if self.message.knowledge_graph.edges[edge_key].object not in object_node_dict.keys():
                                    object_node_dict[self.message.knowledge_graph.edges[edge_key].object] = {self.message.knowledge_graph.edges[edge_key].subject}
                                else:
                                    object_node_dict[self.message.knowledge_graph.edges[edge_key].object].update([self.message.knowledge_graph.edges[edge_key].subject])
                            else:
                                edge_expand_kp.append(edge_attribute_dict['is_defined_by'])
                                rel_edge_type.update([self.message.knowledge_graph.edges[edge_key].predicate])
                                subject_node_list.append(self.message.knowledge_graph.edges[edge_key].object)
                                if self.message.knowledge_graph.edges[edge_key].subject not in object_node_dict.keys():
                                    object_node_dict[self.message.knowledge_graph.edges[edge_key].subject] = {self.message.knowledge_graph.edges[edge_key].object}
                                else:
                                    object_node_dict[self.message.knowledge_graph.edges[edge_key].subject].update([self.message.knowledge_graph.edges[edge_key].object])
                        else:
                            pass
                    else:
                        if subject_qnode_key in nodes_info[self.message.knowledge_graph.edges[edge_key].subject]['qnode_keys']:
                            if object_qnode_key in nodes_info[self.message.knowledge_graph.edges[edge_key].object]['qnode_keys']:
                                edge_expand_kp.append(edge_attribute_dict['is_defined_by'])
                                subject_node_list.append(self.message.knowledge_graph.edges[edge_key].subject)
                                if self.message.knowledge_graph.edges[edge_key].object not in object_node_dict.keys():
                                    object_node_dict[self.message.knowledge_graph.edges[edge_key].object] = {self.message.knowledge_graph.edges[edge_key].subject}
                                else:
                                    object_node_dict[self.message.knowledge_graph.edges[edge_key].object].update([self.message.knowledge_graph.edges[edge_key].subject])

                            else:
                                pass
                        elif object_qnode_key in nodes_info[self.message.knowledge_graph.edges[edge_key].subject]['qnode_keys']:
                            if subject_qnode_key in nodes_info[self.message.knowledge_graph.edges[edge_key].object]['qnode_keys']:
                                edge_expand_kp.append(edge_attribute_dict['is_defined_by'])
                                subject_node_list.append(self.message.knowledge_graph.edges[edge_key].object)
                                if self.message.knowledge_graph.edges[edge_key].subject not in object_node_dict.keys():
                                    object_node_dict[self.message.knowledge_graph.edges[edge_key].subject] = {self.message.knowledge_graph.edges[edge_key].object}
                                else:
                                    object_node_dict[self.message.knowledge_graph.edges[edge_key].subject].update([self.message.knowledge_graph.edges[edge_key].object])

                            else:
                                pass
                        else:
                            pass

                else:
                    pass

                count = count + 1 ## record edge position in message.knowledge_graph

        except:
            tb = traceback.format_exc()
            error_type, error, _ = sys.exc_info()
            self.response.error(tb, error_code=error_type.__name__)
            self.response.error(f"Something went wrong with retrieving edges in message KG")
            return self.response

        subject_node_list = list(set(subject_node_list)) ## remove the duplicate subject node key

        ## check if there is no subject node in message KG
        if len(subject_node_list) == 0:
            self.response.error(f"No subject node found in message KG for Fisher's Exact Test")
            return self.response

        ## check if there is no object node in message KG
        if len(object_node_dict) == 0:
            self.response.error(f"No object node found in message KG for Fisher's Exact Test")
            return self.response

        ## check if subject node has more than one type. If so, throw an error
        if subject_node_category is None:
            self.response.error(f"Subject node with qnode key {subject_qnode_key} was set to None in Query Graph. Please specify the node type")
            return self.response
        else:
            pass

        ## check if object node has more than one type. If so, throw an error
        if object_node_category is None:
            self.response.error(f"Object node with qnode key {object_qnode_key} was set to None in Query Graph. Please specify the node type")
            return self.response
        else:
            pass

        ##check how many kps were used in message KG. If more than one, the one with the max number of edges connnected to both subject nodes and object nodes was used
        if len(collections.Counter(edge_expand_kp))==1:
            kp = edge_expand_kp[0]
        else:
            occurrences = collections.Counter(edge_expand_kp)
            max_index = max([(value, index) for index, value in enumerate(occurrences.values())])[1] # if there are more than one kp having the maximum number of edges, then the last one based on alphabetical order will be chosen.
            kp = list(occurrences.keys())[max_index]
            self.response.debug(f"{occurrences}")
            self.response.warning(f"More than one knowledge provider was detected to be used for expanding the edges connected to both subject node with qnode key {subject_qnode_key} and object node with qnode key {object_qnode_key}")
            self.response.warning(f"The knowledge provider {kp} was used to calculate Fisher's exact test because it has the maximum number of edges both subject node with qnode key {subject_qnode_key} and object node with qnode key {object_qnode_key}")

        ## Print out some information used to calculate FET
        if len(subject_node_list) == 1:
            self.response.debug(f"{len(subject_node_list)} subject node with qnode key {subject_qnode_key} and node type {subject_node_category} was found in message KG and used to calculate Fisher's Exact Test")
        else:
            self.response.debug(f"{len(subject_node_list)} subject nodes with qnode key {subject_qnode_key} and node type {subject_node_category} was found in message KG and used to calculate Fisher's Exact Test")
        if len(object_node_dict) == 1:
            self.response.debug(f"{len(object_node_dict)} object node with qnode key {object_qnode_key} and node type {object_node_category} was found in message KG and used to calculate Fisher's Exact Test")
        else:
            self.response.debug(f"{len(object_node_dict)} object nodes with qnode key {object_qnode_key} and node type {object_node_category} was found in message KG and used to calculate Fisher's Exact Test")


        # find all nodes with the same type of 'subject_qnode_key' nodes in specified KP ('ARAX/KG1','ARAX/KG2','BTE') that are adjacent to target nodes
        use_parallel = False

        if not use_parallel:
            # query adjacent node in one DSL command by providing a list of query nodes to add_qnode()
            if rel_edge_key:
                if len(rel_edge_type) == 1:  # if the edge with rel_edge_key has only type, we use this rel_edge_predicate to find all subject nodes in KP
                    self.response.debug(f"{kp} and edge relation type {list(rel_edge_type)[0]} were used to calculate total object nodes in Fisher's Exact Test")
                    result = self.query_size_of_adjacent_nodes(node_curie=list(object_node_dict.keys()), source_type=object_node_category, adjacent_type=subject_node_category, kp = kp, rel_type=list(rel_edge_type)[0], use_cypher_command=False)
                else:  # if the edge with rel_edge_key has more than one type, we ignore the edge predicate and use all categories to find all subject nodes in KP
                    self.response.warning(f"The edges with specified qedge key {rel_edge_key} have more than one category, we ignore the edge predicate and use all categories to calculate Fisher's Exact Test")
                    self.response.debug(f"{kp} was used to calculate total object nodes in Fisher's Exact Test")
                    result = self.query_size_of_adjacent_nodes(node_curie=list(object_node_dict.keys()), source_type=object_node_category, adjacent_type=subject_node_category, kp=kp, rel_type=None, use_cypher_command=False)
            else:  # if no rel_edge_key is specified, we ignore the edge predicate and use all categories to find all subject nodes in KP
                self.response.debug(f"{kp} was used to calculate total object nodes in Fisher's Exact Test")
                result = self.query_size_of_adjacent_nodes(node_curie=list(object_node_dict.keys()), source_type=object_node_category, adjacent_type=subject_node_category, kp=kp, rel_type=None, use_cypher_command=False)

            if result is None:
                return self.response ## Something wrong happened for querying the adjacent nodes
            else:
                res, removed_nodes = result
                if len(removed_nodes)==0:
                    size_of_object = res
                else:
                    if len(removed_nodes) == 1:
                        self.response.warning(f"One object node which is {removed_nodes[0]} can't find its neighbors. This node will be ignored for FET calculation.")
                    else:
                        self.response.warning(f"{len(removed_nodes)} object nodes which are {removed_nodes} can't find its neighbors. These nodes will be ignored for FET calculation.")
                    for node in removed_nodes:
                        del object_node_dict[node]
                    size_of_object = res
        else:
            # query adjacent node for query nodes one by one in parallel
            if rel_edge_key:
                if len(rel_edge_type) == 1:  # if the edge with rel_edge_key has only type, we use this rel_edge_predicate to find all subject nodes in KP
                    self.response.debug(f"{kp} and edge relation type {list(rel_edge_type)[0]} were used to calculate total adjacent nodes in Fisher's Exact Test")
                    parameter_list = [(node, object_node_category, subject_node_category, kp, list(rel_edge_type)[0]) for node in list(object_node_dict.keys())]
                else:  # if the edge with rel_edge_key has more than one type, we ignore the edge type and use all types to find all source nodes in KP
                    self.response.warning(f"The edges with specified qedge key {rel_edge_key} have more than one type, we ignore the edge type and use all types to calculate Fisher's Exact Test")
                    self.response.debug(f"{kp} was used to calculate total adjacent nodes in Fisher's Exact Test")
                    parameter_list = [(node, object_node_category, subject_node_category, kp, None) for node in list(object_node_dict.keys())]
            else:  # if no rel_edge_key is specified, we ignore the edge type and use all types to find all source nodes in KP
                self.response.debug(f"{kp} was used to calculate total adjacent nodes in Fisher's Exact Test")
                parameter_list = [(node, object_node_category, subject_node_category, kp, None) for node in list(object_node_dict.keys())]

            ## get the count of all nodes with the type of 'subject_qnode_key' nodes in KP for each target node in parallel
            try:
                with multiprocessing.Pool() as executor:
                    object_count_res = [elem for elem in executor.map(self._query_size_of_adjacent_nodes_parallel, parameter_list)]
            except:
                tb = traceback.format_exc()
                error_type, error, _ = sys.exc_info()
                self.response.error(tb, error_code=error_type.__name__)
                self.response.error(f"Something went wrong with querying adjacent nodes in parallel")
                return self.response

            if any([type(elem) is list for elem in object_count_res]):
                for msg in [elem2 for elem1 in object_count_res if type(elem1) is list for elem2 in elem1]:
                    if type(msg) is tuple:
                        self.response.error(msg[0], error_code=msg[1])
                    else:
                        self.response.error(msg)
                return self.response  ## Something wrong happened for querying the adjacent nodes
            else:
                for index in range(len(object_node_dict)):
                    node = list(object_node_dict.keys())[index]
                    size_of_object[node] = object_count_res[index]

        if len(object_node_dict) != 0:
            ## Based on KP detected in message KG, find the total number of node with the same type of source node
            if kp=='ARAX/KG1':
                size_of_total = self.size_of_given_type_in_KP(node_type=subject_node_category, use_cypher_command=False, kg='KG1')
                if size_of_total != 0:
                    self.response.debug(f"ARAX/KG1 and cypher query were used to calculate total number of node with the same type of source node in Fisher's Exact Test")
                    self.response.debug(f"Total {size_of_total} unique concepts with node category {subject_node_category} was found in ARAX/KG1")
                else:
                    size_of_total = self.size_of_given_type_in_KP(node_type=subject_node_category, use_cypher_command=False, kg='KG2') ## If cypher query fails, then try kgNodeIndex
                    if size_of_total==0:
                        self.response.error(f"Both KG1 and KG2 have 0 node with the same type of subject node with qnode key {subject_qnode_key}")
                        return self.response
                    else:
                        self.response.debug(f"Since KG1 can't find the any nodes with node category {subject_node_category}, ARAX/KG2C were used to calculate total number of node with the same type of source node in Fisher's Exact Test")
                        self.response.debug(f"Total {size_of_total} unique concepts with node category {subject_node_category} was found in ARAX/KG2C")

            elif kp=='ARAX/KG2' or kp == 'ARAX/KG2c':
                ## check KG1 first as KG2 might have many duplicates. If KG1 is 0, then check KG2
                size_of_total = self.size_of_given_type_in_KP(node_type=subject_node_category, use_cypher_command=False, kg='KG2') ## Try cypher query first
                self.response.debug(f"ARAX/KG2C were used to calculate total number of node with the same type of source node in Fisher's Exact Test")
                self.response.debug(f"Total {size_of_total} unique concepts with node category {subject_node_category} was found in ARAX/KG2C")

            else:
                self.response.error(f"Only KG1 or KG2 is allowable to calculate the Fisher's exact test temporally")
                return self.response

            size_of_query_sample = len(subject_node_list)

            self.response.debug(f"Computing Fisher's Exact Test P-value")
            # calculate FET p-value for each target node in parallel
            del_list = []
            parameter_list = []
            for node in object_node_dict:
                if size_of_object[node]-len(object_node_dict[node]) < 0:
                    del_list.append(node)
                    self.response.warning(f"Skipping node {node} to calculate FET p-value due to issue897 (which causes negative value).")
                    continue
                else:
                    parameter_list += [(node, len(object_node_dict[node]), size_of_object[node]-len(object_node_dict[node]), size_of_query_sample - len(object_node_dict[node]), (size_of_total - size_of_object[node]) - (size_of_query_sample - len(object_node_dict[node])))]

            for del_node in del_list:
                del object_node_dict[del_node]
            # parameter_list = [(node, len(target_node_dict[node]), size_of_target[node]-len(target_node_dict[node]), size_of_query_sample - len(target_node_dict[node]), (size_of_total - size_of_target[node]) - (size_of_query_sample - len(target_node_dict[node]))) for node in target_node_dict]

            try:
                with multiprocessing.Pool() as executor:
                    FETpvalue_list = [elem for elem in executor.map(self._calculate_FET_pvalue_parallel, parameter_list)]
            except:
                tb = traceback.format_exc()
                error_type, error, _ = sys.exc_info()
                self.response.error(tb, error_code=error_type.__name__)
                self.response.error(f"Something went wrong with computing Fisher's Exact Test P-value")
                return self.response

            if any([type(elem) is list for elem in FETpvalue_list]):
                for msg in [elem2 for elem1 in FETpvalue_list if type(elem1) is list for elem2 in elem1]:
                    if type(msg) is tuple:
                        self.response.error(msg[0], error_code=msg[1])
                    else:
                        self.response.error(msg)
                return self.response
            else:
                output = dict(FETpvalue_list)

            # check if the results need to be filtered
            output = dict(sorted(output.items(), key=lambda x: x[1]))
            if cutoff:
                output = dict(filter(lambda x: x[1] < cutoff, output.items()))
            else:
                pass
            if top_n:
                output = dict(list(output.items())[:top_n])
            else:
                pass

            # add the virtual edge with FET result to message KG
            self.response.debug(f"Adding virtual edge with FET result to message KG")
            count = 0
            for index, value in enumerate([(virtual_relation_label, output[adj], node, adj) for adj in object_node_dict if adj in output.keys() for node in object_node_dict[adj]], 1):

                edge_attribute_list =  [
                    EdgeAttribute(type="EDAM:data_1669", name="fisher_exact_test_p-value", value=str(value[1]), url=None),
                    EdgeAttribute(name="is_defined_by", value="ARAX", type="ARAX_TYPE_PLACEHOLDER"),
                    EdgeAttribute(name="defined_datetime", value=datetime.now().strftime("%Y-%m-%d %H:%M:%S"), type="metatype:Datetime"),
                    EdgeAttribute(name="provided_by", value="ARAX", type="biolink:provided_by"),
                    #EdgeAttribute(name="confidence", value=None, type="biolink:ConfidenceLevel"),
                    #EdgeAttribute(name="weight", value=None, type="metatype:Float")
                ]
                edge_id = f"{value[0]}_{index}"
                edge = Edge(predicate='biolink:has_fisher_exact_test_p-value_with', subject=value[2], object=value[3], relation=value[0],
                            attributes=edge_attribute_list)
                edge.qedge_keys = [value[0]]

                self.message.knowledge_graph.edges[edge_id] = edge

                count = count + 1

            self.response.debug(f"{count} new virtual edges were added to message KG")

            # add the virtual edge to message QG
            if count > 0:
                self.response.debug(f"Adding virtual edge to message QG")
                edge_type = "biolink:has_fisher_exact_test_p-value_with"
                option_group_id = ou.determine_virtual_qedge_option_group(subject_qnode_key, object_qnode_key,
                                                                          self.message.query_graph, self.response)
                qedge_id = virtual_relation_label
                q_edge = QEdge(predicate=edge_type, relation=virtual_relation_label,
                               subject=subject_qnode_key, object=object_qnode_key,
                               option_group_id=option_group_id)
                self.message.query_graph.edges[qedge_id] = q_edge
                self.response.debug(f"One virtual edge was added to message QG")

        return self.response
# Example #6
# 0
    def compute_ngd(self):
        """
        Iterate over all the edges in the knowledge graph, compute the normalized google distance and stick that info
        on the attributes
        :default: The default value to set for NGD if it returns a nan
        :return: response
        """
        if self.response.status != 'OK':  # Catches any errors that may have been logged during initialization
            self._close_database()
            return self.response
        parameters = self.parameters
        self.response.debug(f"Computing NGD")
        self.response.info(f"Computing the normalized Google distance: weighting edges based on subject/object node "
                           f"co-occurrence frequency in PubMed abstracts")
        name = "normalized_google_distance"
        type = "EDAM:data_2526"
        value = self.parameters['default_value']
        url = "https://arax.ncats.io/api/rtx/v1/ui/#/PubmedMeshNgd"
        qg = self.message.query_graph
        kg = self.message.knowledge_graph

        # if you want to add virtual edges, identify the subject/objects, decorate the edges, add them to the KG, and then add one to the QG corresponding to them
        if 'virtual_relation_label' in parameters:
            # Figure out which node pairs to compute NGD between
            subject_qnode_key = parameters['subject_qnode_key']
            object_qnode_key = parameters['object_qnode_key']
            node_pairs_to_evaluate = ou.get_node_pairs_to_overlay(subject_qnode_key, object_qnode_key, qg, kg, self.response)
            # Grab PMID lists for all involved nodes
            involved_curies = {curie for node_pair in node_pairs_to_evaluate for curie in node_pair}
            canonicalized_curie_lookup = self._get_canonical_curies_map(list(involved_curies))
            self.load_curie_to_pmids_data(canonicalized_curie_lookup.values())
            added_flag = False  # check to see if any edges where added
            self.response.debug(f"Looping through {len(node_pairs_to_evaluate)} node pairs and calculating NGD values")
            # iterate over all pairs of these nodes, add the virtual edge, decorate with the correct attribute
            for (subject_curie, object_curie) in node_pairs_to_evaluate:
                # create the edge attribute if it can be
                canonical_subject_curie = canonicalized_curie_lookup.get(subject_curie, subject_curie)
                canonical_object_curie = canonicalized_curie_lookup.get(object_curie, object_curie)
                ngd_value, pmid_set = self.calculate_ngd_fast(canonical_subject_curie, canonical_object_curie)
                if np.isfinite(ngd_value):  # if ngd is finite, that's ok, otherwise, stay with default
                    value = ngd_value
                edge_attribute = EdgeAttribute(type=type, name=name, value=str(value), url=url)  # populate the NGD edge attribute
                pmid_attribute = EdgeAttribute(type="biolink:publications", name="publications", value=[f"PMID:{pmid}" for pmid in pmid_set])
                if edge_attribute:
                    added_flag = True
                    # make the edge, add the attribute

                    # edge properties
                    now = datetime.now()
                    edge_type = "biolink:has_normalized_google_distance_with"
                    qedge_keys = [parameters['virtual_relation_label']]
                    relation = parameters['virtual_relation_label']
                    is_defined_by = "ARAX"
                    defined_datetime = now.strftime("%Y-%m-%d %H:%M:%S")
                    provided_by = "ARAX"
                    confidence = None
                    weight = None  # TODO: could make the actual value of the attribute
                    subject_key = subject_curie
                    object_key = object_curie

                    # now actually add the virtual edges in
                    id = f"{relation}_{self.global_iter}"
                    # ensure the id is unique
                    # might need to change after expand is implemented for TRAPI 1.0
                    while id in self.message.knowledge_graph.edges:
                        id = f"{relation}_{self.global_iter}.{random.randint(10**(9-1), (10**9)-1)}"
                    self.global_iter += 1
                    edge_attribute_list = [
                        edge_attribute,
                        pmid_attribute,
                        EdgeAttribute(name="is_defined_by", value=is_defined_by, type="ARAX_TYPE_PLACEHOLDER"),
                        EdgeAttribute(name="defined_datetime", value=defined_datetime, type="metatype:Datetime"),
                        EdgeAttribute(name="provided_by", value=provided_by, type="biolink:provided_by"),
                        #EdgeAttribute(name="confidence", value=confidence, type="biolink:ConfidenceLevel"),
                        #EdgeAttribute(name="weight", value=weight, type="metatype:Float"),
                        #EdgeAttribute(name="qedge_keys", value=qedge_keys)
                    ]
                    # edge = Edge(id=id, type=edge_type, relation=relation, subject_key=subject_key,
                    #             object_key=object_key,
                    #             is_defined_by=is_defined_by, defined_datetime=defined_datetime,
                    #             provided_by=provided_by,
                    #             confidence=confidence, weight=weight, attributes=[edge_attribute], qedge_ids=qedge_ids)
                    edge = Edge(predicate=edge_type, subject=subject_key, object=object_key, relation=relation,
                                attributes=edge_attribute_list)
                    edge.qedge_keys = qedge_keys
                    self.message.knowledge_graph.edges[id] = edge

            # Now add a q_edge the query_graph since I've added an extra edge to the KG
            if added_flag:
                #edge_type = parameters['virtual_edge_type']
                edge_type = "biolink:has_normalized_google_distance_with"
                relation = parameters['virtual_relation_label']
                option_group_id = ou.determine_virtual_qedge_option_group(subject_qnode_key, object_qnode_key, qg, self.response)
                # q_edge = QEdge(id=relation, type=edge_type, relation=relation,
                #                subject_key=subject_qnode_key, object_key=object_qnode_key,
                #                option_group_id=option_group_id)
                q_edge = QEdge(predicate=edge_type, relation=relation, subject=subject_qnode_key,
                           object=object_qnode_key, option_group_id=option_group_id)
                self.message.query_graph.edges[relation]=q_edge

            self.response.info(f"NGD values successfully added to edges")
        else:  # you want to add it for each edge in the KG
            # iterate over KG edges, add the information
            try:
                # Map all nodes to their canonicalized curies in one batch (need canonical IDs for the local NGD system)
                canonicalized_curie_map = self._get_canonical_curies_map([key for key in self.message.knowledge_graph.nodes.keys()])
                self.load_curie_to_pmids_data(canonicalized_curie_map.values())
                self.response.debug(f"Looping through edges and calculating NGD values")
                for edge in self.message.knowledge_graph.edges.values():
                    # Make sure the attributes are not None
                    if not edge.attributes:
                        edge.attributes = []  # should be an array, but why not a list?
                    # now go and actually get the NGD
                    subject_curie = edge.subject
                    object_curie = edge.object
                    canonical_subject_curie = canonicalized_curie_map.get(subject_curie, subject_curie)
                    canonical_object_curie = canonicalized_curie_map.get(object_curie, object_curie)
                    ngd_value, pmid_set = self.calculate_ngd_fast(canonical_subject_curie, canonical_object_curie)
                    if np.isfinite(ngd_value):  # if ngd is finite, that's ok, otherwise, stay with default
                        value = ngd_value
                    ngd_edge_attribute = EdgeAttribute(type=type, name=name, value=str(value), url=url)  # populate the NGD edge attribute
                    pmid_edge_attribute = EdgeAttribute(type="biolink:publications", name="ngd_publications", value=[f"PMID:{pmid}" for pmid in pmid_set])
                    edge.attributes.append(ngd_edge_attribute)  # append it to the list of attributes
                    edge.attributes.append(pmid_edge_attribute)
            except:
                tb = traceback.format_exc()
                error_type, error, _ = sys.exc_info()
                self.response.error(tb, error_code=error_type.__name__)
                self.response.error(f"Something went wrong adding the NGD edge attributes")
            else:
                self.response.info(f"NGD values successfully added to edges")
            self._close_database()
            return self.response
 def _create_icees_edge_attribute(self, p_value):
     """Wrap an ICEES p-value in an EdgeAttribute.

     The attribute's name and type are taken from this instance's
     ``icees_attribute_name`` and ``icees_attribute_type``.

     :param p_value: value stored, unchanged, as the attribute's ``value``
     :return: the newly built EdgeAttribute
     """
     attribute_fields = dict(name=self.icees_attribute_name,
                             value=p_value,
                             type=self.icees_attribute_type)
     return EdgeAttribute(**attribute_fields)
Пример #8
0
    def add_virtual_edge(self, name="", default=0.):
        """
        Add virtual edges (and one matching query edge) decorated with a KP-derived attribute.

        Knowledge-graph nodes are selected by checking whether their ``qnode_keys``
        contain ``parameters['subject_qnode_key']`` / ``parameters['object_qnode_key']``.
        For every (subject, object) combination, ``make_edge_attribute_from_curies``
        is asked for an attribute; when it returns a truthy one, a new edge keyed by
        ``"<virtual_relation_label>_<global_iter>"`` is stored in the knowledge graph.
        If at least one edge was stored, a query edge is registered in the query graph
        under ``parameters['virtual_relation_label']``.

        :param name: name of the functionality of the KP to use; also embedded in the
            edge predicate as ``biolink:has_<name>_with``
        :param default: default attribute value forwarded to ``make_edge_attribute_from_curies``
        :return: None (the message on ``self`` is modified in place)
        """
        params = self.parameters
        kg = self.message.knowledge_graph

        subject_curies = set()
        object_curies = set()
        # FIXME: Super hacky way to get around the fact that COHD can't map CHEMBL drugs:
        # node names are remembered so they can be passed along with the CURIEs.
        names_by_curie = {}

        # Pick out the nodes that fulfil the subject / object query nodes.
        for curie, node in kg.nodes.items():
            if not hasattr(node, 'qnode_keys'):
                continue
            if params['subject_qnode_key'] in node.qnode_keys:
                subject_curies.add(curie)
                names_by_curie[curie] = node.name
            if params['object_qnode_key'] in node.qnode_keys:
                object_curies.add(curie)
                names_by_curie[curie] = node.name

        any_edge_added = False
        # Try every subject/object combination; only pairs that yield an attribute get an edge.
        for subject_curie, object_curie in itertools.product(subject_curies, object_curies):
            attribute = self.make_edge_attribute_from_curies(
                subject_curie,
                object_curie,
                subject_name=names_by_curie[subject_curie],
                object_name=names_by_curie[object_curie],
                default=default,
                name=name)
            if not attribute:
                continue
            any_edge_added = True

            timestamp = datetime.now()
            predicate = f"biolink:has_{name}_with"
            virtual_label = params['virtual_relation_label']
            source = "ARAX"  # used for both is_defined_by and provided_by

            # Pick a key not yet present in the KG; on a clash, append a random 9-digit suffix.
            # might need to change after expand is implemented for TRAPI 1.0
            edge_key = f"{virtual_label}_{self.global_iter}"
            while edge_key in kg.edges:
                edge_key = f"{virtual_label}_{self.global_iter}.{random.randint(10**8, 10**9 - 1)}"
            self.global_iter += 1

            attributes = [attribute]
            attributes.extend(
                EdgeAttribute(name=attr_name, value=attr_value, type=attr_type)
                for attr_name, attr_value, attr_type in (
                    ("is_defined_by", source, "ARAX_TYPE_PLACEHOLDER"),
                    ("defined_datetime", timestamp.strftime("%Y-%m-%d %H:%M:%S"), "metatype:Datetime"),
                    ("provided_by", source, "biolink:provided_by"),
                ))

            virtual_edge = Edge(predicate=predicate,
                                subject=subject_curie,
                                object=object_curie,
                                relation=virtual_label,
                                attributes=attributes)
            virtual_edge.qedge_keys = [virtual_label]
            kg.edges[edge_key] = virtual_edge

        if not any_edge_added:
            return

        # An edge was added to the KG, so the query graph needs a matching q_edge.
        virtual_label = params['virtual_relation_label']
        subject_qnode_key = params['subject_qnode_key']
        object_qnode_key = params['object_qnode_key']
        option_group_id = ou.determine_virtual_qedge_option_group(
            subject_qnode_key, object_qnode_key, self.message.query_graph,
            self.response)
        self.message.query_graph.edges[virtual_label] = QEdge(
            predicate=f"biolink:has_{name}_with",
            relation=virtual_label,
            subject=subject_qnode_key,
            object=object_qnode_key,
            option_group_id=option_group_id)
Пример #9
0
    def make_edge_attribute_from_curies(self,
                                        subject_curie,
                                        object_curie,
                                        subject_name="",
                                        object_name="",
                                        default=0.,
                                        name=""):
        """
        Build an edge attribute for a subject/object CURIE pair from a knowledge provider (KP).

        A KP is selected by checking, via ``self.in_common``, which entry of
        ``self.who_knows_about_what`` covers the types of both CURIEs (the last
        matching entry wins). Only the 'COHD' KP is handled here.

        :param subject_curie: CURIE of the subject node for the edge under consideration
        :param object_curie: CURIE of the object node for the edge under consideration
        :param subject_name: text name of the subject node (currently only used in the debug log message)
        :param object_name: text name of the object node (currently only used in the debug log message)
        :param default: initial attribute value; it is the final value only for
            'paired_concept_frequency' with no COHD data, or for a ``name`` that is
            none of the recognised metrics
        :param name: name of the KP functionality you want to apply:
            'paired_concept_frequency', 'observed_expected_ratio' or 'chi_square'
        :return: an EdgeAttribute whose value is the stringified metric, or None when
            the selected KP is not 'COHD' or when an error was logged to ``self.response``
        """
        # Bound before the try block so the error handler can always reference it,
        # even when the failure happens before a KP has been selected.
        KP_to_use = None
        try:
            # edge attribute properties
            attribute_type = "EDAM:data_0951"
            url = "http://cohd.smart-api.info/"
            value = default

            subject_type = self.node_curie_to_type[subject_curie]
            object_type = self.node_curie_to_type[object_curie]

            # figure out which knowledge provider to use
            # TODO: should handle this in a more structured fashion, does there exist a standardized KP API format?
            for KP in self.who_knows_about_what:
                known_types = self.who_knows_about_what[KP]
                # a KP qualifies only if it can label both ends of the edge
                if self.in_common(subject_type, known_types) and self.in_common(object_type, known_types):
                    KP_to_use = KP

            if KP_to_use != 'COHD':
                return None

            self.response.debug(
                f"Querying Columbia Open Health data for info about {subject_name} and {object_name}"
            )

            # convert CURIEs to OMOP identifiers
            res = self.cohdIndex.get_concept_ids(subject_curie)
            subject_OMOPs = res if len(res) != 0 else []
            res = self.cohdIndex.get_concept_ids(object_curie)
            object_OMOPs = res if len(res) != 0 else []

            # every subject/object OMOP combination, in the "<id1>_<id2>" form passed to cohdIndex
            omop_pairs = [
                f"{omop1}_{omop2}"
                for (omop1, omop2) in itertools.product(subject_OMOPs, object_OMOPs)
            ]

            # Decide how to handle the response from the KP. In each branch the first
            # result is taken as the best one.
            # NOTE(review): this assumes cohdIndex returns results sorted best-first — confirm.
            if name == 'paired_concept_frequency':
                # use the largest frequency  # TODO check with COHD people to see if this is kosher
                if omop_pairs:
                    res = self.cohdIndex.get_paired_concept_freq(
                        concept_id_pair=omop_pairs,
                        dataset_id=3)  # use the hierarchical dataset
                    if len(res) != 0:
                        value = res[0]['concept_frequency']

            elif name == 'observed_expected_ratio':
                # should probably take the largest obs/exp ratio  # TODO: check with COHD people to see if this is kosher
                # FIXME: the ln_ratio can be negative, so I should probably account for this, but the object model doesn't like -np.inf
                value = float("-inf")
                if omop_pairs:
                    res = self.cohdIndex.get_obs_exp_ratio(
                        concept_id_pair=omop_pairs,
                        domain="",
                        dataset_id=3)  # use the hierarchical dataset
                    if len(res) != 0:
                        value = res[0]['ln_ratio']

            elif name == 'chi_square':
                # looking at p-values, so lower is better; start from the worst possible value
                value = float("inf")
                if omop_pairs:
                    res = self.cohdIndex.get_chi_square(
                        concept_id_pair=omop_pairs,
                        domain="",
                        dataset_id=3)  # use the hierarchical dataset
                    if len(res) != 0:
                        value = res[0]['p-value']

            # FIXME: unclear in object model if attribute type dictates value type, or if value
            # always needs to be a string; the value is stringified for now
            return EdgeAttribute(type=attribute_type, name=name, value=str(value), url=url)
        except Exception as err:
            # Log the failure on the response instead of propagating it; KeyboardInterrupt and
            # SystemExit are deliberately not caught.
            self.response.error(traceback.format_exc(), error_code=type(err).__name__)
            self.response.error(
                f"Something went wrong when adding the edge attribute from {KP_to_use}."
            )
            return None
Пример #10
0
    def compute_ngd(self):
        """
        Iterate over all the edges in the knowledge graph, compute the normalized google distance and stick that info
        on the attributes
        :default: The default value to set for NGD if it returns a nan
        :return: response
        """
        if self.response.status != 'OK':  # Catches any errors that may have been logged during initialization
            self._close_database()
            return self.response
        parameters = self.parameters
        self.response.debug(f"Computing NGD")
        self.response.info(f"Computing the normalized Google distance: weighting edges based on subject/object node "
                           f"co-occurrence frequency in PubMed abstracts")
        name = "normalized_google_distance"
        type = "EDAM:data_2526"
        default_value = self.parameters['default_value']
        url = "https://arax.ncats.io/api/rtx/v1/ui/#/PubmedMeshNgd"
        qg = self.message.query_graph
        kg = self.message.knowledge_graph
        ngd_description = """
        Normalized google distance is a metric based on edge subject/object node co-occurrence in abstracts of all [PubMed](https://pubmed.ncbi.nlm.nih.gov/) articles. 
        The formula can be found here on [wikipedia.](https://en.wikipedia.org/wiki/Normalized_Google_distance) 
        Where in this case f(x,y) is the number of PubMed abstracts both concepts apear in, f(x)/f(y) are the number of abstracts individual concepts apear in, and N is the number of pubmed articles times the average numbver of search terms per article (27 million * 20).
        """
        
        # if you want to add virtual edges, identify the subject/objects, decorate the edges, add them to the KG, and then add one to the QG corresponding to them
        # FW: changing this so if there is a virtual relation label but no subject and object then add edges for all subject object pairs in the quesry graph.
        if 'subject_qnode_key' not in parameters and 'object_qnode_key' not in parameters and 'virtual_relation_label' in parameters:
            seen_node_pairs = set()
            qgraph_edges = copy.deepcopy(list(qg.edges.values()))
            for query_edge in qgraph_edges:
                subject_qnode_key = query_edge.subject
                object_qnode_key = query_edge.object
                if subject_qnode_key < object_qnode_key:
                    qnode_key_pair = (subject_qnode_key,object_qnode_key)
                else:
                    qnode_key_pair = (object_qnode_key,subject_qnode_key)
                # FW: check if we have already added an edge for this pair
                if qnode_key_pair in seen_node_pairs:
                    pass
                else:
                    seen_node_pairs.add(qnode_key_pair)
                    # FW: Now add the edge for this qnode pair
                    # FW NOTE: If we decide to keep these changes we should really pull this out into a method as everything after this was copy pasted from below in the 'virtual_relation_label' in parameters section
                    node_pairs_to_evaluate = ou.get_node_pairs_to_overlay(subject_qnode_key, object_qnode_key, qg, kg, self.response)
                    # Grab PMID lists for all involved nodes
                    involved_curies = {curie for node_pair in node_pairs_to_evaluate for curie in node_pair}
                    canonicalized_curie_lookup = self._get_canonical_curies_map(list(involved_curies))
                    self.load_curie_to_pmids_data(canonicalized_curie_lookup.values())
                    added_flag = False  # check to see if any edges where added
                    self.response.debug(f"Looping through {len(node_pairs_to_evaluate)} node pairs and calculating NGD values")
                    # iterate over all pairs of these nodes, add the virtual edge, decorate with the correct attribute
                    for (subject_curie, object_curie) in node_pairs_to_evaluate:
                        # create the edge attribute if it can be
                        canonical_subject_curie = canonicalized_curie_lookup.get(subject_curie, subject_curie)
                        canonical_object_curie = canonicalized_curie_lookup.get(object_curie, object_curie)
                        ngd_value, pmid_set = self.calculate_ngd_fast(canonical_subject_curie, canonical_object_curie)
                        if np.isfinite(ngd_value):  # if ngd is finite, that's ok, otherwise, stay with default
                            edge_value = ngd_value
                        else:
                            edge_value = default_value
                        edge_attribute = EdgeAttribute(attribute_type_id=type, original_attribute_name=name, value=str(edge_value), value_url=url, description=ngd_description)  # populate the NGD edge attribute
                        pmid_attribute = EdgeAttribute(attribute_type_id="biolink:publications", original_attribute_name="publications", value=[f"PMID:{pmid}" for pmid in pmid_set])
                        if edge_attribute:
                            added_flag = True
                            # make the edge, add the attribute

                            # edge properties
                            now = datetime.now()
                            edge_type = "biolink:has_normalized_google_distance_with"
                            qedge_keys = [parameters['virtual_relation_label']]
                            relation = parameters['virtual_relation_label']
                            is_defined_by = "ARAX"
                            defined_datetime = now.strftime("%Y-%m-%d %H:%M:%S")
                            provided_by = "infores:arax"
                            confidence = None
                            weight = None  # TODO: could make the actual value of the attribute
                            subject_key = subject_curie
                            object_key = object_curie

                            # now actually add the virtual edges in
                            id = f"{relation}_{self.global_iter}"
                            # ensure the id is unique
                            # might need to change after expand is implemented for TRAPI 1.0
                            while id in self.message.knowledge_graph.edges:
                                id = f"{relation}_{self.global_iter}.{random.randint(10**(9-1), (10**9)-1)}"
                            self.global_iter += 1
                            edge_attribute_list = [
                                edge_attribute,
                                pmid_attribute,
                                EdgeAttribute(original_attribute_name="virtual_relation_label", value=relation, attribute_type_id="biolink:Unknown"),
                                #EdgeAttribute(original_attribute_name="is_defined_by", value=is_defined_by, attribute_type_id="biolink:Unknown"),
                                EdgeAttribute(original_attribute_name="defined_datetime", value=defined_datetime, attribute_type_id="metatype:Datetime"),
                                EdgeAttribute(original_attribute_name="provided_by", value=provided_by, attribute_type_id="biolink:aggregator_knowledge_source", attribute_source=provided_by, value_type_id="biolink:InformationResource"),
                                EdgeAttribute(original_attribute_name=None, value=True, attribute_type_id="biolink:computed_value", attribute_source="infores:arax-reasoner-ara", value_type_id="metatype:Boolean", value_url=None, description="This edge is a container for a computed value between two nodes that is not directly attachable to other edges.")
                                #EdgeAttribute(original_attribute_name="confidence", value=confidence, attribute_type_id="biolink:ConfidenceLevel"),
                                #EdgeAttribute(original_attribute_name="weight", value=weight, attribute_type_id="metatype:Float"),
                                #EdgeAttribute(original_attribute_name="qedge_keys", value=qedge_keys)
                            ]
                            # edge = Edge(id=id, type=edge_type, relation=relation, subject_key=subject_key,
                            #             object_key=object_key,
                            #             is_defined_by=is_defined_by, defined_datetime=defined_datetime,
                            #             provided_by=provided_by,
                            #             confidence=confidence, weight=weight, attributes=[edge_attribute], qedge_ids=qedge_ids)

                            #### FIXME temporary hack by EWD
                            #edge = Edge(predicate=edge_type, subject=subject_key, object=object_key, relation=relation,
                            #            attributes=edge_attribute_list)
                            edge = Edge(predicate=edge_type, subject=subject_key, object=object_key,
                                        attributes=edge_attribute_list)
                            #edge.relation = relation
                            #### /end FIXME

                            edge.qedge_keys = qedge_keys
                            self.message.knowledge_graph.edges[id] = edge

                            #FW: check if results exist then modify them with the ngd edge
                            if self.message.results is not None and len(self.message.results) > 0:
                                ou.update_results_with_overlay_edge(subject_knode_key=subject_key, object_knode_key=object_key, kedge_key=id, message=self.message, log=self.response)

                    # Now add a q_edge the query_graph since I've added an extra edge to the KG
                    if added_flag:
                        #edge_type = parameters['virtual_edge_type']
                        edge_type = [ "biolink:has_normalized_google_distance_with" ]
                        relation = parameters['virtual_relation_label']
                        option_group_id = ou.determine_virtual_qedge_option_group(subject_qnode_key, object_qnode_key, qg, self.response)
                        # q_edge = QEdge(id=relation, type=edge_type, relation=relation,
                        #                subject_key=subject_qnode_key, object_key=object_qnode_key,
                        #                option_group_id=option_group_id)

                        #### FIXME by EWD. For later fixing
                        #q_edge = QEdge(predicates=edge_type, relation=relation, subject=subject_qnode_key,
                        #           object=object_qnode_key, option_group_id=option_group_id)
                        q_edge = QEdge(predicates=edge_type, subject=subject_qnode_key,
                                   object=object_qnode_key, option_group_id=option_group_id)
                        q_edge.relation = relation
                        #### end FIXME

                        self.message.query_graph.edges[relation]=q_edge


                    self.response.info(f"NGD values successfully added to edges for the qnode pair ({subject_qnode_key},{object_qnode_key})")

        elif 'virtual_relation_label' in parameters:
            # Figure out which node pairs to compute NGD between
            subject_qnode_key = parameters['subject_qnode_key']
            object_qnode_key = parameters['object_qnode_key']
            node_pairs_to_evaluate = ou.get_node_pairs_to_overlay(subject_qnode_key, object_qnode_key, qg, kg, self.response)
            # Grab PMID lists for all involved nodes
            involved_curies = {curie for node_pair in node_pairs_to_evaluate for curie in node_pair}
            canonicalized_curie_lookup = self._get_canonical_curies_map(list(involved_curies))
            self.load_curie_to_pmids_data(canonicalized_curie_lookup.values())
            added_flag = False  # check to see if any edges where added
            self.response.debug(f"Looping through {len(node_pairs_to_evaluate)} node pairs and calculating NGD values")
            # iterate over all pairs of these nodes, add the virtual edge, decorate with the correct attribute
            for (subject_curie, object_curie) in node_pairs_to_evaluate:
                # create the edge attribute if it can be
                canonical_subject_curie = canonicalized_curie_lookup.get(subject_curie, subject_curie)
                canonical_object_curie = canonicalized_curie_lookup.get(object_curie, object_curie)
                ngd_value, pmid_set = self.calculate_ngd_fast(canonical_subject_curie, canonical_object_curie)
                if np.isfinite(ngd_value):  # if ngd is finite, that's ok, otherwise, stay with default
                    edge_value = ngd_value
                else:
                    edge_value = default_value
                edge_attribute = EdgeAttribute(attribute_type_id=type, original_attribute_name=name, value=str(edge_value), value_url=url, description=ngd_description)  # populate the NGD edge attribute
                pmid_attribute = EdgeAttribute(attribute_type_id="biolink:publications", original_attribute_name="publications", value=[f"PMID:{pmid}" for pmid in pmid_set])
                if edge_attribute:
                    added_flag = True
                    # make the edge, add the attribute

                    # edge properties
                    now = datetime.now()
                    edge_type = "biolink:has_normalized_google_distance_with"
                    qedge_keys = [parameters['virtual_relation_label']]
                    relation = parameters['virtual_relation_label']
                    is_defined_by = "ARAX"
                    defined_datetime = now.strftime("%Y-%m-%d %H:%M:%S")
                    provided_by = "infores:arax"
                    confidence = None
                    weight = None  # TODO: could make the actual value of the attribute
                    subject_key = subject_curie
                    object_key = object_curie

                    # now actually add the virtual edges in
                    id = f"{relation}_{self.global_iter}"
                    # ensure the id is unique
                    # might need to change after expand is implemented for TRAPI 1.0
                    while id in self.message.knowledge_graph.edges:
                        id = f"{relation}_{self.global_iter}.{random.randint(10**(9-1), (10**9)-1)}"
                    self.global_iter += 1
                    edge_attribute_list = [
                        edge_attribute,
                        pmid_attribute,
                        EdgeAttribute(original_attribute_name="virtual_relation_label", value=relation, attribute_type_id="biolink:Unknown"),
                        #EdgeAttribute(original_attribute_name="is_defined_by", value=is_defined_by, attribute_type_id="biolink:Unknown"),
                        EdgeAttribute(original_attribute_name="defined_datetime", value=defined_datetime, attribute_type_id="metatype:Datetime"),
                        EdgeAttribute(original_attribute_name="provided_by", value=provided_by, attribute_type_id="biolink:aggregator_knowledge_source", attribute_source=provided_by, value_type_id="biolink:InformationResource"),
                        EdgeAttribute(original_attribute_name=None, value=True, attribute_type_id="biolink:computed_value", attribute_source="infores:arax-reasoner-ara", value_type_id="metatype:Boolean", value_url=None, description="This edge is a container for a computed value between two nodes that is not directly attachable to other edges.")
                        #EdgeAttribute(original_attribute_name="confidence", value=confidence, attribute_type_id="biolink:ConfidenceLevel"),
                        #EdgeAttribute(original_attribute_name="weight", value=weight, attribute_type_id="metatype:Float"),
                        #EdgeAttribute(original_attribute_name="qedge_keys", value=qedge_keys)
                    ]
                    # edge = Edge(id=id, type=edge_type, relation=relation, subject_key=subject_key,
                    #             object_key=object_key,
                    #             is_defined_by=is_defined_by, defined_datetime=defined_datetime,
                    #             provided_by=provided_by,
                    #             confidence=confidence, weight=weight, attributes=[edge_attribute], qedge_ids=qedge_ids)

                    #### FIXME temporary hack by EWD
                    #edge = Edge(predicate=edge_type, subject=subject_key, object=object_key, relation=relation,
                    #            attributes=edge_attribute_list)
                    edge = Edge(predicate=edge_type, subject=subject_key, object=object_key,
                                attributes=edge_attribute_list)
                    #edge.relation = relation
                    #### /end FIXME

                    edge.qedge_keys = qedge_keys
                    self.message.knowledge_graph.edges[id] = edge

                    #FW: check if results exist then modify them with the ngd edge
                    if self.message.results is not None and len(self.message.results) > 0:
                        ou.update_results_with_overlay_edge(subject_knode_key=subject_key, object_knode_key=object_key, kedge_key=id, message=self.message, log=self.response)

            # Now add a q_edge the query_graph since I've added an extra edge to the KG
            if added_flag:
                #edge_type = parameters['virtual_edge_type']
                edge_type = [ "biolink:has_normalized_google_distance_with" ]
                relation = parameters['virtual_relation_label']
                option_group_id = ou.determine_virtual_qedge_option_group(subject_qnode_key, object_qnode_key, qg, self.response)
                # q_edge = QEdge(id=relation, type=edge_type, relation=relation,
                #                subject_key=subject_qnode_key, object_key=object_qnode_key,
                #                option_group_id=option_group_id)

                #### FIXME by EWD. For later fixing
                #q_edge = QEdge(predicates=edge_type, relation=relation, subject=subject_qnode_key,
                #           object=object_qnode_key, option_group_id=option_group_id)
                q_edge = QEdge(predicates=edge_type, subject=subject_qnode_key,
                           object=object_qnode_key, option_group_id=option_group_id)
                q_edge.relation = relation
                #### end FIXME

                self.message.query_graph.edges[relation]=q_edge


            self.response.info(f"NGD values successfully added to edges")
        else:  # you want to add it for each edge in the KG
            # iterate over KG edges, add the information
            try:
                # Map all nodes to their canonicalized curies in one batch (need canonical IDs for the local NGD system)
                canonicalized_curie_map = self._get_canonical_curies_map([key for key in self.message.knowledge_graph.nodes.keys()])
                self.load_curie_to_pmids_data(canonicalized_curie_map.values())
                self.response.debug(f"Looping through edges and calculating NGD values")
                for edge in self.message.knowledge_graph.edges.values():
                    # Make sure the attributes are not None
                    if not edge.attributes:
                        edge.attributes = []  # should be an array, but why not a list?
                    # now go and actually get the NGD
                    subject_curie = edge.subject
                    object_curie = edge.object
                    canonical_subject_curie = canonicalized_curie_map.get(subject_curie, subject_curie)
                    canonical_object_curie = canonicalized_curie_map.get(object_curie, object_curie)
                    ngd_value, pmid_set = self.calculate_ngd_fast(canonical_subject_curie, canonical_object_curie)
                    if np.isfinite(ngd_value):  # if ngd is finite, that's ok, otherwise, stay with default
                        edge_value = ngd_value
                    else:
                        edge_value = default_value
                    ngd_edge_attribute = EdgeAttribute(attribute_type_id=type, original_attribute_name=name, value=str(edge_value), value_url=url, description=ngd_description)  # populate the NGD edge attribute
                    pmid_edge_attribute = EdgeAttribute(attribute_type_id="biolink:publications", original_attribute_name="ngd_publications", value_type_id="EDAM:data_1187", value=[f"PMID:{pmid}" for pmid in pmid_set])
                    edge.attributes.append(ngd_edge_attribute)  # append it to the list of attributes
                    edge.attributes.append(pmid_edge_attribute)
            except:
                tb = traceback.format_exc()
                error_type, error, _ = sys.exc_info()
                self.response.error(tb, error_code=error_type.__name__)
                self.response.error(f"Something went wrong adding the NGD edge attributes")
            else:
                self.response.info(f"NGD values successfully added to edges")
            self._close_database()
        return self.response
Пример #11
0
 def _create_icees_edge_attribute(self, p_value):
     """
     Build the edge attribute that carries an ICEES p-value.

     :param p_value: the value stored in the attribute's ``value`` field
     :return: an EdgeAttribute whose original name is ``self.icees_attribute_name``
         and whose type id is ``self.icees_attribute_type``
     """
     attribute_name = self.icees_attribute_name
     attribute_type = self.icees_attribute_type
     icees_attribute = EdgeAttribute(value=p_value,
                                     attribute_type_id=attribute_type,
                                     original_attribute_name=attribute_name)
     return icees_attribute
Пример #12
0
    def add_virtual_edge(self, name="", default=0.):
        """
        Generic function to add a virtual edge to the KG and the QG.

        Every KG node bound to ``parameters['subject_qnode_key']`` is paired with
        every KG node bound to ``parameters['object_qnode_key']``. For each pair,
        ``make_edge_attribute_from_curies`` is asked for an attribute; when it
        returns a truthy one, a new KG edge carrying that attribute is stored under
        a unique key derived from ``parameters['virtual_relation_label']``, and
        existing results (if any) are updated with it. If at least one edge was
        added, a matching virtual q_edge is stored in the query graph under the
        relation label.

        :param name: name of the functionality of the KP to use (passed through to
            ``make_edge_attribute_from_curies``)
        :param default: default value passed through to
            ``make_edge_attribute_from_curies``
        :return: None; ``self.message``, ``self.global_iter`` and
            ``self.mapping_curie_to_omop_ids`` are modified in place
        """
        parameters = self.parameters
        predicate = "biolink:has_real_world_evidence_of_association_with"
        provided_by = "infores:arax"

        # identify the nodes that we should be adding virtual edges for
        subject_curies_to_decorate = set()
        object_curies_to_decorate = set()
        # FIXME: Super hacky way to get around the fact that COHD can't map CHEMBL drugs
        curies_to_names = dict()
        for key, node in self.message.knowledge_graph.nodes.items():
            if not hasattr(node, 'qnode_keys'):
                continue
            if parameters['subject_qnode_key'] in node.qnode_keys:
                subject_curies_to_decorate.add(key)
                curies_to_names[key] = node.name
            if parameters['object_qnode_key'] in node.qnode_keys:
                object_curies_to_decorate.add(key)
                curies_to_names[key] = node.name

        # call the COHD api one time (for all involved curies) to save time
        self.mapping_curie_to_omop_ids = self.cohdIndex.get_concept_ids(
            subject_curies_to_decorate | object_curies_to_decorate)

        added_flag = False  # check to see if any edges were added
        # iterate over all pairs of these nodes, add the virtual edge, decorate with the correct attribute
        for subject_curie, object_curie in itertools.product(subject_curies_to_decorate,
                                                             object_curies_to_decorate):
            # create the edge attribute if it can be
            edge_attribute = self.make_edge_attribute_from_curies(
                subject_curie,
                object_curie,
                subject_name=curies_to_names[subject_curie],
                object_name=curies_to_names[object_curie],
                default=default,
                name=name)
            if not edge_attribute:
                continue
            added_flag = True

            relation = parameters['virtual_relation_label']
            defined_datetime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

            # ensure the edge key is unique: on a collision, append a random 9-digit suffix
            # might need to change after expand is implemented for TRAPI 1.0
            edge_key = f"{relation}_{self.global_iter}"
            while edge_key in self.message.knowledge_graph.edges:
                edge_key = f"{relation}_{self.global_iter}.{random.randint(10**(9-1), (10**9)-1)}"
            self.global_iter += 1

            edge_attribute_list = [
                edge_attribute,
                EdgeAttribute(original_attribute_name="virtual_relation_label",
                              value=relation,
                              attribute_type_id="biolink:Unknown"),
                EdgeAttribute(original_attribute_name="defined_datetime",
                              value=defined_datetime,
                              attribute_type_id="metatype:Datetime"),
                EdgeAttribute(original_attribute_name="provided_by",
                              value=provided_by,
                              attribute_type_id="biolink:aggregator_knowledge_source",
                              attribute_source=provided_by,
                              value_type_id="biolink:InformationResource"),
                EdgeAttribute(original_attribute_name=None,
                              value=True,
                              attribute_type_id="biolink:computed_value",
                              attribute_source="infores:arax-reasoner-ara",
                              value_type_id="metatype:Boolean",
                              value_url=None,
                              description="This edge is a container for a computed value between two nodes that is not directly attachable to other edges."),
            ]
            edge = Edge(predicate=predicate,
                        subject=subject_curie,
                        object=object_curie,
                        attributes=edge_attribute_list)
            edge.qedge_keys = [relation]
            self.message.knowledge_graph.edges[edge_key] = edge

            # if results already exist, modify them with the new virtual edge
            if self.message.results is not None and len(self.message.results) > 0:
                ou.update_results_with_overlay_edge(
                    subject_knode_key=subject_curie,
                    object_knode_key=object_curie,
                    kedge_key=edge_key,
                    message=self.message,
                    log=self.response)

        # Now add a q_edge to the query_graph since an extra edge was added to the KG
        if added_flag:
            relation = parameters['virtual_relation_label']
            subject_qnode_key = parameters['subject_qnode_key']
            object_qnode_key = parameters['object_qnode_key']
            option_group_id = ou.determine_virtual_qedge_option_group(
                subject_qnode_key, object_qnode_key, self.message.query_graph,
                self.response)
            # `predicates` is a plural (list) field, so wrap the single predicate
            # in a list instead of handing over a bare string
            q_edge = QEdge(predicates=[predicate],
                           subject=subject_qnode_key,
                           object=object_qnode_key,
                           option_group_id=option_group_id)
            q_edge.relation = relation
            self.message.query_graph.edges[relation] = q_edge
Пример #13
0
    def fisher_exact_test(self):
        """
        Peform the fisher's exact test to expand or decorate the knowledge graph
        :return: response
        """

        self.response.info(f"Performing Fisher's Exact Test to add p-value to edge attribute of virtual edge")

        # check the input parameters
        if 'subject_qnode_key' not in self.parameters:
            self.response.error(f"The argument 'subject_qnode_key' is required for fisher_exact_test function")
            return self.response
        else:
            subject_qnode_key = self.parameters['subject_qnode_key']
        if 'virtual_relation_label' not in self.parameters:
            self.response.error(f"The argument 'virtual_relation_label' is required for fisher_exact_test function")
            return self.response
        else:
            virtual_relation_label = str(self.parameters['virtual_relation_label'])
        if 'object_qnode_key' not in self.parameters:
            self.response.error(f"The argument 'object_qnode_key' is required for fisher_exact_test function")
            return self.response
        else:
            object_qnode_key = self.parameters['object_qnode_key']
        rel_edge_key = self.parameters['rel_edge_key'] if 'rel_edge_key' in self.parameters else None
        top_n = int(self.parameters['top_n']) if 'top_n' in self.parameters else None
        cutoff = float(self.parameters['cutoff']) if 'cutoff' in self.parameters else None

        ## check if the new model files exists in /predictor/retrain_data. If not, scp it from arax.ncats.io
        pathlist = os.path.realpath(__file__).split(os.path.sep)
        RTXindex = pathlist.index("RTX")
        filepath = os.path.sep.join([*pathlist[:(RTXindex + 1)], 'code', 'ARAX', 'KnowledgeSources', 'KG2c'])

        ## check if there is kg2c.sqlite
        sqlite_name = RTXConfig.kg2c_sqlite_path.split("/")[-1]
        sqlite_file_path = f"{filepath}{os.path.sep}{sqlite_name}"
        if os.path.exists(sqlite_file_path):
            pass
        else:
            os.system(f"scp {RTXConfig.kg2c_sqlite_username}@{RTXConfig.kg2c_sqlite_host}:{RTXConfig.kg2c_sqlite_path} {sqlite_file_path}")
        self.sqlite_file_path = sqlite_file_path

        if rel_edge_key is not None:
            self.response.warning(f"The 'rel_edge_key' option in FET is specified, it will cause slow for the calculation of FEST test.")

        # initialize some variables
        nodes_info = {}
        edge_expand_kp = []
        subject_node_list = []
        object_node_dict = {}
        size_of_object = {}
        subject_node_exist = False
        object_node_exist = False
        query_edge_key = set()
        rel_edge_type = set()
        subject_node_category = None
        object_node_category = None

        ## Check if subject_qnode_key and object_qnode_key are in the Query Graph
        try:
            if len(self.message.query_graph.nodes) != 0:
                for node_key in self.message.query_graph.nodes:
                    if node_key == subject_qnode_key:
                        subject_node_exist = True
                        subject_node_category = self.message.query_graph.nodes[node_key].categories
                    elif node_key == object_qnode_key:
                        object_node_exist = True
                        object_node_category = self.message.query_graph.nodes[node_key].categories
                    else:
                        pass
            else:
                self.response.error(f"There is no query node in QG")
                return self.response
        except:
            tb = traceback.format_exc()
            error_type, error, _ = sys.exc_info()
            self.response.error(tb, error_code=error_type.__name__)
            self.response.error(f"Something went wrong with retrieving nodes in message QG")
            return self.response

        if subject_node_exist:
            if object_node_exist:
                pass
            else:
                self.response.error(f"No query node with object qnode key {object_qnode_key} detected in QG for Fisher's Exact Test")
                return self.response
        else:
            self.response.error(f"No query node with subject qnode key {subject_qnode_key} detected in QG for Fisher's Exact Test")
            return self.response

        ## Check if there is a query edge connected to both subject_qnode_key and object_qnode_key in the Query Graph
        try:
            if len(self.message.query_graph.edges) != 0:
                for edge_key in self.message.query_graph.edges:
                    qedge_relation = None
                    if hasattr(self.message.query_graph.edges[edge_key], "relation"):
                        qedge_relation = self.message.query_graph.edges[edge_key].relation
                    if self.message.query_graph.edges[edge_key].subject == subject_qnode_key and self.message.query_graph.edges[edge_key].object == object_qnode_key and qedge_relation == None:
                        query_edge_key.update([edge_key])  # only actual query edge is added
                    elif self.message.query_graph.edges[edge_key].subject == object_qnode_key and self.message.query_graph.edges[edge_key].object == subject_qnode_key and qedge_relation == None:
                        query_edge_key.update([edge_key])  # only actual query edge is added
                    else:
                        continue
            else:
                self.response.error(f"There is no query edge in Query Graph")
                return self.response
        except:
            tb = traceback.format_exc()
            error_type, error, _ = sys.exc_info()
            self.response.error(tb, error_code=error_type.__name__)
            self.response.error(f"Something went wrong with retrieving edges in message QG")
            return self.response

        if len(query_edge_key)!=0:
            if rel_edge_key:
                if rel_edge_key in query_edge_key:
                    pass
                else:
                    self.response.error(f"No query edge with qedge key {rel_edge_key} connected to both subject node with qnode key {subject_qnode_key} and object node with qnode key {object_qnode_key} detected in QG for Fisher's Exact Test")
                    return self.response
            else:
                pass
        else:
            self.response.error(
                f"No query edge connected to both subject node with qnode key {subject_qnode_key} and object node with qnode key {object_qnode_key} detected in QG for Fisher's Exact Test")
            return self.response

        ## loop over all nodes in KG and collect their node information
        try:
            for node_key, node in self.message.knowledge_graph.nodes.items():
                nodes_info[node_key] = {'qnode_keys': node.qnode_keys, 'category': self.message.knowledge_graph.nodes[node_key].categories[0]}
        except:
            tb = traceback.format_exc()
            error_type, error, _ = sys.exc_info()
            self.response.error(tb, error_code=error_type.__name__)
            self.response.error(f"Something went wrong with retrieving nodes in message KG")
            return self.response

        ## loop over all edges in KG and create subject node list and target node dict based on subject_qnode_key, object_qnode_key as well as rel_edge_id (optional, otherwise all edges are considered)
        try:
            for edge_key, edge in self.message.knowledge_graph.edges.items():

                edge_attribute_list = [x.value for x in self.message.knowledge_graph.edges[edge_key].attributes if x.original_attribute_name == 'is_defined_by']
                if len(edge_attribute_list) == 0:

                    ## Collect all knowldge source information for each edge between queried qnode_keys (eg. 'n01', 'n02')
                    temp_kp = []
                    for x in self.message.knowledge_graph.edges[edge_key].attributes:
                        if x.attribute_type_id == 'biolink:aggregator_knowledge_source' or x.attribute_type_id == 'biolink:knowledge_source':
                            temp_kp += self._change_kp_name(x.value)
                    if 'arax' in temp_kp:
                        temp_kp.remove('arax')

                    if rel_edge_key:
                        if rel_edge_key in edge.qedge_keys:
                            if subject_qnode_key in nodes_info[self.message.knowledge_graph.edges[edge_key].subject]['qnode_keys']:
                                edge_expand_kp.extend(temp_kp)
                                rel_edge_type.update([self.message.knowledge_graph.edges[edge_key].predicate])
                                subject_node_list.append(self.message.knowledge_graph.edges[edge_key].subject)
                                if self.message.knowledge_graph.edges[edge_key].object not in object_node_dict.keys():
                                    object_node_dict[self.message.knowledge_graph.edges[edge_key].object] = {self.message.knowledge_graph.edges[edge_key].subject}
                                else:
                                    object_node_dict[self.message.knowledge_graph.edges[edge_key].object].update([self.message.knowledge_graph.edges[edge_key].subject])
                            else:
                                edge_expand_kp.extend(temp_kp)
                                rel_edge_type.update([self.message.knowledge_graph.edges[edge_key].predicate])
                                subject_node_list.append(self.message.knowledge_graph.edges[edge_key].object)
                                if self.message.knowledge_graph.edges[edge_key].subject not in object_node_dict.keys():
                                    object_node_dict[self.message.knowledge_graph.edges[edge_key].subject] = {self.message.knowledge_graph.edges[edge_key].object}
                                else:
                                    object_node_dict[self.message.knowledge_graph.edges[edge_key].subject].update([self.message.knowledge_graph.edges[edge_key].object])
                    else:
                        if subject_qnode_key in nodes_info[self.message.knowledge_graph.edges[edge_key].subject]['qnode_keys']:
                            if object_qnode_key in nodes_info[self.message.knowledge_graph.edges[edge_key].object]['qnode_keys']:
                                edge_expand_kp.extend(temp_kp)
                                subject_node_list.append(self.message.knowledge_graph.edges[edge_key].subject)
                                if self.message.knowledge_graph.edges[edge_key].object not in object_node_dict.keys():
                                    object_node_dict[self.message.knowledge_graph.edges[edge_key].object] = {self.message.knowledge_graph.edges[edge_key].subject}
                                else:
                                    object_node_dict[self.message.knowledge_graph.edges[edge_key].object].update([self.message.knowledge_graph.edges[edge_key].subject])

                        elif object_qnode_key in nodes_info[self.message.knowledge_graph.edges[edge_key].subject]['qnode_keys']:
                            if subject_qnode_key in nodes_info[self.message.knowledge_graph.edges[edge_key].object]['qnode_keys']:
                                edge_expand_kp.extend(temp_kp)
                                subject_node_list.append(self.message.knowledge_graph.edges[edge_key].object)
                                if self.message.knowledge_graph.edges[edge_key].subject not in object_node_dict.keys():
                                    object_node_dict[self.message.knowledge_graph.edges[edge_key].subject] = {self.message.knowledge_graph.edges[edge_key].object}
                                else:
                                    object_node_dict[self.message.knowledge_graph.edges[edge_key].subject].update([self.message.knowledge_graph.edges[edge_key].object])

        except:
            tb = traceback.format_exc()
            error_type, error, _ = sys.exc_info()
            self.response.error(tb, error_code=error_type.__name__)
            self.response.error(f"Something went wrong with retrieving edges in message KG")
            return self.response

        subject_node_list = list(set(subject_node_list)) ## remove the duplicate subject node key

        ## check if there is no subject node in message KG
        if len(subject_node_list) == 0:
            self.response.error(f"No subject node found in message KG for Fisher's Exact Test")
            return self.response

        ## check if there is no object node in message KG
        if len(object_node_dict) == 0:
            self.response.error(f"No object node found in message KG for Fisher's Exact Test")
            return self.response

        ## check if subject node has more than one type. If so, throw an error
        if subject_node_category is None:
            self.response.error(f"Subject node with qnode key {subject_qnode_key} was set to None in Query Graph. Please specify the node type")
            return self.response

        ## check if object node has more than one type. If so, throw an error
        if object_node_category is None:
            self.response.error(f"Object node with qnode key {object_qnode_key} was set to None in Query Graph. Please specify the node type")
            return self.response
        else:
            pass

        ##check how many kps were used in message KG. If more than one, the one with the max number of edges connnected to both subject nodes and object nodes was used
        if len(collections.Counter(edge_expand_kp))==1:
            kp = edge_expand_kp[0]
        else:
            occurrences = collections.Counter(edge_expand_kp)
            max_index = max([(value, index) for index, value in enumerate(occurrences.values())])[1] # if there are more than one kp having the maximum number of edges, then the last one based on alphabetical order will be chosen.
            kp = list(occurrences.keys())[max_index]
            self.response.debug(f"{occurrences}")
            self.response.warning(f"More than one knowledge provider were detected to be used for expanding the edges connected to both subject node with qnode key {subject_qnode_key} and object node with qnode key {object_qnode_key}")
            self.response.warning(f"The knowledge provider {kp} was used to calculate Fisher's exact test because it has the maximum number of edges connected to both subject node with qnode key {subject_qnode_key} and object node with qnode key {object_qnode_key}")

        ## map kp to "ARAX/KG1" or "infores:rtx-kg2"; any other kp falls back to "infores:rtx-kg2" with a warning
        if kp == "rtx_kg1_kp":
            kp = 'ARAX/KG1'
        elif kp == "rtx-kg2":
            kp = 'infores:rtx-kg2'
        else:
            kp = 'infores:rtx-kg2'
            self.response.warning(f"There is more than one knowledge source for the edges between the subject node with qnode key {subject_qnode_key} and object node with qnode key {object_qnode_key} and most of them are from {kp}. The infores:rtx-kg2 is still used to calculate Fisher's exact test.")

        if kp == 'ARAX/KG1':
            ## This warning can be removed once KG1 is deprecated
            self.response.warning(f"Since KG1 will be deprecated soon and the total count of nodes is based on kg2c, currently querying with 'expand(kp=ARAX/KG1)' might cause little discrepancy for FET probability.")

        ## Print out some information used to calculate FET
        if len(subject_node_list) == 1:
            self.response.debug(f"{len(subject_node_list)} subject node with qnode key {subject_qnode_key} and node type {subject_node_category[0]} was found in message KG and used to calculate Fisher's Exact Test")
        else:
            self.response.debug(f"{len(subject_node_list)} subject nodes with qnode key {subject_qnode_key} and node type {subject_node_category[0]} was found in message KG and used to calculate Fisher's Exact Test")
        if len(object_node_dict) == 1:
            self.response.debug(f"{len(object_node_dict)} object node with qnode key {object_qnode_key} and node type {object_node_category[0]} was found in message KG and used to calculate Fisher's Exact Test")
        else:
            self.response.debug(f"{len(object_node_dict)} object nodes with qnode key {object_qnode_key} and node type {object_node_category[0]} was found in message KG and used to calculate Fisher's Exact Test")


        # find all nodes with the same type of 'subject_qnode_key' nodes in specified KP ('ARAX/KG1','infores:rtx-kg2') that are adjacent to target nodes
        # if rel_edge_key is not None, query adjacent node from database otherwise query adjacent node with DSL command by providing a list of query nodes to add_qnode()
        ## Note: Regardless of whether kp='ARAX/KG1' or kp='infores:rtx-kg2', it will always query adjacent node count based on kg2c
        if rel_edge_key:
            if len(rel_edge_type) == 1:  # if the edge with rel_edge_key has only type, we use this rel_edge_predicate to find all subject nodes in KP
                self.response.debug(f"{kp} and edge relation type {list(rel_edge_type)[0]} were used to calculate total object nodes in Fisher's Exact Test")
                result = self.query_size_of_adjacent_nodes(node_curie=list(object_node_dict.keys()), source_type=object_node_category[0], adjacent_type=subject_node_category[0], kp=kp, rel_type=list(rel_edge_type)[0])
            else:  # if the edge with rel_edge_key has more than one type or no edge, we ignore the edge predicate and use all categories to find all subject nodes in KP
                if len(rel_edge_key) == 0:
                    self.response.warning(f"The edges with specified qedge key {rel_edge_key} have no category, we ignore the edge predicate and use all categories to calculate Fisher's Exact Test")
                else:
                    self.response.warning(f"The edges with specified qedge key {rel_edge_key} have more than one category, we ignore the edge predicate and use all categories to calculate Fisher's Exact Test")
                self.response.debug(f"infores:rtx-kg2 was used to calculate total object nodes in Fisher's Exact Test")
                result = self.query_size_of_adjacent_nodes(node_curie=list(object_node_dict.keys()), source_type=object_node_category[0], adjacent_type=subject_node_category[0], kp='infores:rtx-kg2', rel_type=None)
        else:  # if no rel_edge_key is specified, we ignore the edge predicate and use all categories to find all subject nodes in KP
            self.response.debug(f"infores:rtx-kg2 was used to calculate total object nodes in Fisher's Exact Test")
            result = self.query_size_of_adjacent_nodes(node_curie=list(object_node_dict.keys()), source_type=object_node_category[0], adjacent_type=subject_node_category[0], kp='infores:rtx-kg2', rel_type=None)

        if result is None:
            return self.response  ## Something wrong happened for querying the adjacent nodes
        else:
            res, removed_nodes = result
            if len(removed_nodes)==0:
                size_of_object = res
            else:
                if len(removed_nodes) == 1:
                    self.response.warning(f"One object node which is {removed_nodes[0]} can't find its neighbors. This node will be ignored for FET calculation.")
                else:
                    self.response.warning(f"{len(removed_nodes)} object nodes which are {removed_nodes} can't find its neighbors. These nodes will be ignored for FET calculation.")
                for node in removed_nodes:
                    del object_node_dict[node]
                size_of_object = res

        if len(object_node_dict) != 0:
            ## Based on KP detected in message KG, find the total count of node with the same type of source node
            ## Note: Regardless of whether kg='KG1' or kg='KG2' is specified in self.size_of_given_type_in_KP, it will always query total count based on kg2c
            if kp=='ARAX/KG1' or kp=='infores:rtx-kg2':
                size_of_total = self.size_of_given_type_in_KP(node_type=subject_node_category[0])
                self.response.debug(f"Total {size_of_total} unique concepts with node category {subject_node_category[0]} was found in KG2c based on 'nodesynonymizer.get_total_entity_count' and this number will be used for Fisher's Exact Test")
            else:
                self.response.error(f"Only KG1 or KG2 is allowable to calculate the Fisher's exact test temporally")
                return self.response

            size_of_query_sample = len(subject_node_list)

            self.response.debug(f"Computing Fisher's Exact Test P-value")
            # calculate FET p-value for each target node (sequentially via map(); the multiprocessing.Pool variant is commented out)

            parameter_list = []
            del_list = []
            for node in object_node_dict:
                temp = [len(object_node_dict[node]), size_of_object[node]-len(object_node_dict[node]), size_of_query_sample - len(object_node_dict[node]), (size_of_total - size_of_object[node]) - (size_of_query_sample - len(object_node_dict[node]))]
                if any([value < 0 for value in temp]) is True:
                    del_list.append(node)
                    self.response.warning(f"Skipping node {node} to calculate FET p-value due to issue1438 (which causes negative value).")

            for del_node in del_list:
                del object_node_dict[del_node]
            parameter_list = [(node, len(object_node_dict[node]), size_of_object[node]-len(object_node_dict[node]), size_of_query_sample - len(object_node_dict[node]), (size_of_total - size_of_object[node]) - (size_of_query_sample - len(object_node_dict[node]))) for node in object_node_dict]

            try:
                # with multiprocessing.Pool() as executor:
                #     FETpvalue_list = [elem for elem in executor.map(self._calculate_FET_pvalue_parallel, parameter_list)]
                FETpvalue_list = [elem for elem in map(self._calculate_FET_pvalue_parallel, parameter_list)]
            except:
                tb = traceback.format_exc()
                error_type, error, _ = sys.exc_info()
                self.response.error(tb, error_code=error_type.__name__)
                self.response.error(f"Something went wrong with computing Fisher's Exact Test P-value")
                return self.response

            if any([type(elem) is list for elem in FETpvalue_list]):
                for msg in [elem2 for elem1 in FETpvalue_list if type(elem1) is list for elem2 in elem1]:
                    if type(msg) is tuple:
                        self.response.error(msg[0], error_code=msg[1])
                    else:
                        self.response.error(msg)
                return self.response
            else:
                output = dict(FETpvalue_list)

            # check if the results need to be filtered
            output = dict(sorted(output.items(), key=lambda x: x[1]))
            if cutoff:
                output = dict(filter(lambda x: x[1] < cutoff, output.items()))
            else:
                pass
            if top_n:
                output = dict(list(output.items())[:top_n])
            else:
                pass

            # add the virtual edge with FET result to message KG
            self.response.debug(f"Adding virtual edge with FET result to message KG")
            count = 0
            for index, value in enumerate([(virtual_relation_label, output[adj], node, adj) for adj in object_node_dict if adj in output.keys() for node in object_node_dict[adj]], 1):

                edge_attribute_list =  [
                    EdgeAttribute(attribute_type_id="EDAM:data_1669", original_attribute_name="fisher_exact_test_p-value", value=str(value[1]), value_url=None),
                    EdgeAttribute(original_attribute_name="virtual_relation_label", value=value[0], attribute_type_id="biolink:Unknown"),
                    #EdgeAttribute(original_attribute_name="is_defined_by", value="ARAX", attribute_type_id="biolink:Unknown"),
                    EdgeAttribute(original_attribute_name="defined_datetime", value=datetime.now().strftime("%Y-%m-%d %H:%M:%S"), attribute_type_id="metatype:Datetime"),
                    EdgeAttribute(original_attribute_name="provided_by", value="infores:arax", attribute_type_id="biolink:aggregator_knowledge_source", attribute_source="infores:arax", value_type_id="biolink:InformationResource"),
                    EdgeAttribute(original_attribute_name=None, value=True, attribute_type_id="biolink:computed_value", attribute_source="infores:arax-reasoner-ara", value_type_id="metatype:Boolean", value_url=None, description="This edge is a container for a computed value between two nodes that is not directly attachable to other edges.")
                    #EdgeAttribute(original_attribute_name="confidence", value=None, type="biolink:ConfidenceLevel"),
                    #EdgeAttribute(original_attribute_name="weight", value=None, type="metatype:Float")
                ]
                edge_id = f"{value[0]}_{index}"
                edge = Edge(predicate='biolink:has_fisher_exact_test_p_value_with', subject=value[2], object=value[3],
                            attributes=edge_attribute_list)
                edge.qedge_keys = [value[0]]

                self.message.knowledge_graph.edges[edge_id] = edge

                if self.message.results is not None and len(self.message.results) > 0:
                    ou.update_results_with_overlay_edge(subject_knode_key=value[2], object_knode_key=value[3], kedge_key=edge_id, message=self.message, log=self.response)

                count = count + 1

            self.response.debug(f"{count} new virtual edges were added to message KG")

            # add the virtual edge to message QG
            if count > 0:
                self.response.debug(f"Adding virtual edge to message QG")
                edge_type = ["biolink:has_fisher_exact_test_p_value_with"]
                option_group_id = ou.determine_virtual_qedge_option_group(subject_qnode_key, object_qnode_key,
                                                                          self.message.query_graph, self.response)
                qedge_id = virtual_relation_label
                q_edge = QEdge(predicates=edge_type,
                               subject=subject_qnode_key, object=object_qnode_key,
                               option_group_id=option_group_id)
                q_edge.relation = virtual_relation_label
                self.message.query_graph.edges[qedge_id] = q_edge
                self.response.debug(f"One virtual edge was added to message QG")

        return self.response