Exemplo n.º 1
0
 def _create_ngd_edge(self, ngd_value: float, subject: str, object: str,
                      pmid_list: list) -> Tuple[str, Edge]:
     ngd_edge = Edge()
     ngd_edge.predicate = self.ngd_edge_type
     ngd_edge.subject = subject
     ngd_edge.object = object
     ngd_edge_key = f"NGD:{subject}--{ngd_edge.predicate}--{object}"
     ngd_edge.attributes = [
         Attribute(name=self.ngd_edge_attribute_name,
                   type=self.ngd_edge_attribute_type,
                   value=ngd_value,
                   url=self.ngd_edge_attribute_url)
     ]
     ngd_edge.attributes += [
         Attribute(name="provided_by",
                   value="ARAX",
                   type=eu.get_attribute_type("provided_by")),
         Attribute(name="is_defined_by",
                   value="ARAX",
                   type=eu.get_attribute_type("is_defined_by")),
         Attribute(name="publications",
                   value=pmid_list,
                   type=eu.get_attribute_type("publications"))
     ]
     return ngd_edge_key, ngd_edge
Exemplo n.º 2
0
    def _add_answers_to_kg(self, answer_kg: QGOrganizedKnowledgeGraph, reasoner_std_response: Dict[str, any],
                           input_qnode_key: str, output_qnode_key: str, qedge_key: str, log: ARAXResponse) -> QGOrganizedKnowledgeGraph:
        kg_to_qg_ids_dict = self._build_kg_to_qg_id_dict(reasoner_std_response['results'])
        if reasoner_std_response['knowledge_graph']['edges']:
            remapped_node_keys = dict()
            log.debug(f"Got results back from BTE for this query "
                      f"({len(reasoner_std_response['knowledge_graph']['edges'])} edges)")

            for node in reasoner_std_response['knowledge_graph']['nodes']:
                swagger_node = Node()
                bte_node_key = node.get('id')
                swagger_node.name = node.get('name')
                swagger_node.category = eu.convert_to_list(eu.convert_string_to_snake_case(node.get('type')))

                # Map the returned BTE qg_ids back to the original qnode_keys in our query graph
                bte_qg_id = kg_to_qg_ids_dict['nodes'].get(bte_node_key)
                if bte_qg_id == "n0":
                    qnode_key = input_qnode_key
                elif bte_qg_id == "n1":
                    qnode_key = output_qnode_key
                else:
                    log.error("Could not map BTE qg_id to ARAX qnode_key", error_code="UnknownQGID")
                    return answer_kg

                # Find and use the preferred equivalent identifier for this node (if it's an output node)
                if qnode_key == output_qnode_key:
                    if bte_node_key in remapped_node_keys:
                        swagger_node_key = remapped_node_keys.get(bte_node_key)
                    else:
                        equivalent_curies = [f"{prefix}:{eu.get_curie_local_id(local_id)}" for prefix, local_ids in
                                             node.get('equivalent_identifiers').items() for local_id in local_ids]
                        swagger_node_key = self._get_best_equivalent_bte_curie(equivalent_curies, swagger_node.category[0])
                        remapped_node_keys[bte_node_key] = swagger_node_key
                else:
                    swagger_node_key = bte_node_key

                answer_kg.add_node(swagger_node_key, swagger_node, qnode_key)

            for edge in reasoner_std_response['knowledge_graph']['edges']:
                swagger_edge = Edge()
                swagger_edge_key = edge.get("id")
                swagger_edge.predicate = edge.get('type')
                swagger_edge.subject = remapped_node_keys.get(edge.get('source_id'), edge.get('source_id'))
                swagger_edge.object = remapped_node_keys.get(edge.get('target_id'), edge.get('target_id'))
                swagger_edge.attributes = [Attribute(name="provided_by", value=edge.get('edge_source'), type=eu.get_attribute_type("provided_by")),
                                           Attribute(name="is_defined_by", value="BTE", type=eu.get_attribute_type("is_defined_by"))]
                # Map the returned BTE qg_id back to the original qedge_key in our query graph
                bte_qg_id = kg_to_qg_ids_dict['edges'].get(swagger_edge_key)
                if bte_qg_id != "e1":
                    log.error("Could not map BTE qg_id to ARAX qedge_key", error_code="UnknownQGID")
                    return answer_kg
                answer_kg.add_edge(swagger_edge_key, swagger_edge, qedge_key)

        return answer_kg
Exemplo n.º 3
0
 def _create_swagger_edge_from_kp_edge(self, kp_edge_key: str,
                                       kp_edge: Dict[str, any]) -> Edge:
     swagger_edge = Edge(subject=kp_edge['subject'],
                         object=kp_edge['object'],
                         predicate=kp_edge['predicate'])
     swagger_edge.attributes = [
         Attribute(name="provided_by",
                   value=self.kp_name,
                   type=eu.get_attribute_type("provided_by")),
         Attribute(name="is_defined_by",
                   value="ARAX",
                   type=eu.get_attribute_type("is_defined_by"))
     ]
     return kp_edge_key, swagger_edge
Exemplo n.º 4
0
 def _convert_kg2c_edge_to_swagger_edge(
         self, neo4j_edge: Dict[str, any]) -> Tuple[str, Edge]:
     swagger_edge = Edge()
     swagger_edge_key = f"KG2c:{neo4j_edge.get('id')}"
     swagger_edge.predicate = neo4j_edge.get("simplified_edge_label")
     swagger_edge.subject = neo4j_edge.get("subject")
     swagger_edge.object = neo4j_edge.get("object")
     other_properties = ["provided_by", "publications"]
     swagger_edge.attributes = self._create_swagger_attributes(
         other_properties, neo4j_edge)
     is_defined_by_attribute = Attribute(
         name="is_defined_by",
         value="ARAX/KG2c",
         type=eu.get_attribute_type("is_defined_by"))
     swagger_edge.attributes.append(is_defined_by_attribute)
     return swagger_edge_key, swagger_edge
Exemplo n.º 5
0
 def _convert_kg1_edge_to_swagger_edge(
         self, neo4j_edge: Dict[str, any],
         node_uuid_to_curie_dict: Dict[str, str]) -> Tuple[str, Edge]:
     swagger_edge = Edge()
     swagger_edge_key = f"KG1:{neo4j_edge.get('id')}"
     swagger_edge.predicate = neo4j_edge.get("predicate")
     swagger_edge.subject = node_uuid_to_curie_dict[neo4j_edge.get(
         "source_node_uuid")]
     swagger_edge.object = node_uuid_to_curie_dict[neo4j_edge.get(
         "target_node_uuid")]
     swagger_edge.relation = neo4j_edge.get("relation")
     other_properties = ["provided_by", "probability"]
     swagger_edge.attributes = self._create_swagger_attributes(
         other_properties, neo4j_edge)
     is_defined_by_attribute = Attribute(
         name="is_defined_by",
         value="ARAX/KG1",
         type=eu.get_attribute_type("is_defined_by"))
     swagger_edge.attributes.append(is_defined_by_attribute)
     return swagger_edge_key, swagger_edge
Exemplo n.º 6
0
 def _convert_kg2_edge_to_swagger_edge(self, neo4j_edge: Dict[str,
                                                              any]) -> Edge:
     swagger_edge = Edge()
     swagger_edge_key = f"KG2:{neo4j_edge.get('id')}"
     swagger_edge.predicate = neo4j_edge.get("simplified_edge_label")
     swagger_edge.subject = neo4j_edge.get("subject")
     swagger_edge.object = neo4j_edge.get("object")
     swagger_edge.relation = neo4j_edge.get("relation")
     # Add additional properties on KG2 edges as swagger Attribute objects
     other_properties = [
         "provided_by", "publications", "negated", "relation_curie",
         "simplified_relation_curie", "simplified_relation", "edge_label"
     ]
     swagger_edge.attributes = self._create_swagger_attributes(
         other_properties, neo4j_edge)
     is_defined_by_attribute = Attribute(
         name="is_defined_by",
         value="ARAX/KG2",
         type=eu.get_attribute_type("is_defined_by"))
     swagger_edge.attributes.append(is_defined_by_attribute)
     return swagger_edge_key, swagger_edge
Exemplo n.º 7
0
    def _convert_to_swagger_edge(self, subject: str, object: str, name: str,
                                 value: float) -> Tuple[str, Edge]:
        swagger_edge = Edge()
        swagger_edge.predicate = f"biolink:{name}"
        swagger_edge.subject = subject
        swagger_edge.object = object
        swagger_edge_key = f"CHP:{subject}-{name}-{object}"
        swagger_edge.relation = None

        type = "EDAM:data_0951"
        url = "https://github.com/di2ag/chp_client"

        swagger_edge.attributes = [
            Attribute(type=type, name=name, value=str(value), url=url),
            Attribute(name="provided_by",
                      value=self.kp_name,
                      type=eu.get_attribute_type("provided_by")),
            Attribute(name="is_defined_by",
                      value="ARAX",
                      type=eu.get_attribute_type("is_defined_by"))
        ]
        return swagger_edge_key, swagger_edge
Exemplo n.º 8
0
 def _create_swagger_edge_from_kp_edge(
         self, kp_edge: Dict[str, any]) -> Tuple[str, Edge]:
     swagger_edge = Edge(subject=kp_edge['source_id'],
                         object=kp_edge['target_id'],
                         predicate=kp_edge['type'])
     swagger_edge.attributes = [
         Attribute(name="provided_by",
                   value=self.kp_name,
                   type=eu.get_attribute_type("provided_by")),
         Attribute(name="is_defined_by",
                   value="ARAX",
                   type=eu.get_attribute_type("is_defined_by"))
     ]
     score_name = kp_edge['score_name']
     score_value = kp_edge.get('score')
     if score_value:  # Some returned edges are missing a score value for whatever reason
         swagger_edge.attributes.append(
             Attribute(name=score_name,
                       type=self.score_type_lookup.get(
                           score_name, "biolink:Unknown"),
                       value=score_value))
     return kp_edge['id'], swagger_edge
Exemplo n.º 9
0
 def _create_ngd_edge(self, ngd_value: float, subject: str, object: str,
                      pmid_list: list) -> Tuple[str, Edge]:
     ngd_edge = Edge()
     ngd_edge.predicate = self.ngd_edge_predicate
     ngd_edge.subject = subject
     ngd_edge.object = object
     ngd_edge_key = f"NGD:{subject}--{ngd_edge.predicate}--{object}"
     ngd_edge.attributes = [
         Attribute(original_attribute_name=self.ngd_edge_attribute_name,
                   attribute_type_id=self.ngd_edge_attribute_type,
                   value=ngd_value)
     ]
     kp_description = "ARAX's in-house normalized google distance database."
     ngd_edge.attributes += [
         self.decorator.create_attribute("publications", pmid_list),
         eu.get_kp_source_attribute(
             "infores:arax-normalized-google-distance",
             arax_kp=True,
             description=kp_description),
         eu.get_arax_source_attribute(),
         eu.get_computed_value_attribute()
     ]
     return ngd_edge_key, ngd_edge
Exemplo n.º 10
0
    def predict_drug_treats_disease(self):
        """
        Iterate over all the edges in the knowledge graph, add the drug-disease treatment probability for appropriate edges
        on the edge_attributes
        :return: response
        """
        parameters = self.parameters
        self.response.debug(f"Computing drug disease treatment probability based on a machine learning model")
        self.response.info(f"Computing drug disease treatment probability based on a machine learning model: See [this publication](https://doi.org/10.1101/765305) for more details about how this is accomplished.")

        attribute_name = "probability_treats"
        attribute_type = "EDAM:data_0951"
        value = 0  # this will be the default value. If the model returns 0, or the default is there, don't include that edge
        url = "https://doi.org/10.1101/765305"

        # if you want to add virtual edges, identify the source/targets, decorate the edges, add them to the KG, and then add one to the QG corresponding to them
        if 'virtual_relation_label' in parameters:
            source_curies_to_decorate = set()
            target_curies_to_decorate = set()
            curie_to_name = dict()
            # identify the nodes that we should be adding virtual edges for
            for node_key, node in self.message.knowledge_graph.nodes.items():
                if hasattr(node, 'qnode_keys'):
                    if parameters['subject_qnode_key'] in node.qnode_keys:
                        if "drug" in node.category or "chemical_substance" in node.category or "biolink:Drug" in node.category or "biolink:ChemicalSubstance" in node.category:  # this is now NOT checked by ARAX_overlay
                            source_curies_to_decorate.add(node_key)
                            curie_to_name[node_key] = node.name
                    if parameters['object_qnode_key'] in node.qnode_keys:
                        if "disease" in node.category or "phenotypic_feature" in node.category or "biolink:Disease" in node.category or "biolink:PhenotypicFeature" in node.category:  # this is now NOT checked by ARAX_overlay
                            target_curies_to_decorate.add(node_key)
                            curie_to_name[node_key] = node.name

            added_flag = False  # check to see if any edges where added
            # iterate over all pairs of these nodes, add the virtual edge, decorate with the correct attribute

            for (source_curie, target_curie) in itertools.product(source_curies_to_decorate, target_curies_to_decorate):
                self.response.debug(f"Predicting probability that {curie_to_name[source_curie]} treats {curie_to_name[target_curie]}")
                # create the edge attribute if it can be
                # loop over all equivalent curies and take the highest probability

                max_probability = 0
                converted_source_curie = self.convert_to_trained_curies(source_curie)
                if converted_source_curie is None:
                    continue
                else:
                    preferred_type = converted_source_curie['preferred_type']
                    if preferred_type == "drug" or preferred_type == "chemical_substance" or preferred_type == "biolink:Drug" or preferred_type == "biolink:ChemicalSubstance":
                        converted_source_curie = converted_source_curie['preferred_curie']
                    else:
                        continue
                converted_target_curie = self.convert_to_trained_curies(target_curie)
                if converted_target_curie is None:
                    continue
                else:
                    preferred_type = converted_target_curie['preferred_type']
                    if preferred_type == "disease" or preferred_type == "phenotypic_feature" or preferred_type == "biolink:Disease" or preferred_type == "biolink:PhenotypicFeature":
                        converted_target_curie = converted_target_curie['preferred_curie']
                    else:
                        continue
                if self.use_prob_db is True:
                    probability = self.pred.get_prob_from_DTD_db(converted_source_curie, converted_target_curie)
                    if probability is not None:
                        if np.isfinite(probability):
                            max_probability = probability
                else:
                    probability = self.pred.prob_single(converted_source_curie, converted_target_curie)
                    if probability is not None:
                        probability = probability[0]
                        if np.isfinite(probability):
                            max_probability = probability
                # if len(res) != 0:
                #     all_probabilities = self.pred.prob_all(res)
                #     if isinstance(all_probabilities, list):
                #         max_probability = max([value for value in all_probabilities if np.isfinite(value)])

                value = max_probability

                #probability = self.pred.prob_single('ChEMBL:' + source_curie[22:], target_curie)  # FIXME: when this was trained, it was ChEMBL:123, not CHEMBL.COMPOUND:CHEMBL123
                #if probability and np.isfinite(probability):  # finite, that's ok, otherwise, stay with default
                #    value = probability[0]
                edge_attribute = EdgeAttribute(type=attribute_type, name=attribute_name, value=str(value), url=url)  # populate the edge attribute
                if edge_attribute and value != 0:
                    added_flag = True
                    # make the edge, add the attribute

                    # edge properties
                    now = datetime.now()
                    edge_type = "biolink:probably_treats"
                    qedge_keys = [parameters['virtual_relation_label']]
                    relation = parameters['virtual_relation_label']
                    is_defined_by = "ARAX"
                    defined_datetime = now.strftime("%Y-%m-%d %H:%M:%S")
                    provided_by = "ARAX"
                    confidence = None
                    weight = None  # TODO: could make the actual value of the attribute
                    subject_key = source_curie
                    object_key = target_curie

                    # now actually add the virtual edges in
                    id = f"{relation}_{self.global_iter}"
                    self.global_iter += 1
                    edge_attribute_list = [
                        edge_attribute,
                        EdgeAttribute(name="is_defined_by", value=is_defined_by, type="ARAX_TYPE_PLACEHOLDER"),
                        EdgeAttribute(name="defined_datetime", value=defined_datetime, type="metatype:Datetime"),
                        EdgeAttribute(name="provided_by", value=provided_by, type="biolink:provided_by"),
                        #EdgeAttribute(name="confidence", value=confidence, type="biolink:ConfidenceLevel"),
                        #EdgeAttribute(name="weight", value=weight, type="metatype:Float")
                    ]
                    edge = Edge(predicate=edge_type, subject=subject_key, object=object_key, relation=relation,
                                attributes=edge_attribute_list)
                    edge.qedge_keys = qedge_keys
                    self.message.knowledge_graph.edges[id] = edge

            # Now add a q_edge the query_graph since I've added an extra edge to the KG
            if added_flag:
                edge_type = "biolink:probably_treats"
                relation = parameters['virtual_relation_label']
                subject_qnode_key = parameters['subject_qnode_key']
                object_qnode_key = parameters['object_qnode_key']
                option_group_id = ou.determine_virtual_qedge_option_group(subject_qnode_key, object_qnode_key, self.message.query_graph, self.response)
                q_edge = QEdge(predicate=edge_type, relation=relation, subject=subject_qnode_key, object=object_qnode_key, option_group_id=option_group_id)
                self.message.query_graph.edges[relation] = q_edge
            return self.response

        else:  # you want to add it for each edge in the KG
            # iterate over KG edges, add the information
            try:
                # map curies to types
                curie_to_type = dict()
                curie_to_name = dict()
                for node_key, node in self.message.knowledge_graph.nodes.items():
                    curie_to_type[node_key] = node.category
                    curie_to_name[node_key] = node.name
                # then iterate over the edges and decorate if appropriate
                for edge_key, edge in self.message.knowledge_graph.edges.items():
                    # Make sure the edge_attributes are not None
                    if not edge.attributes:
                        edge.attributes = []  # should be an array, but why not a list?
                    # now go and actually get the probability
                    source_curie = edge.subject
                    target_curie = edge.object
                    source_types = curie_to_type[source_curie]
                    target_types = curie_to_type[target_curie]
                    if (("drug" in source_types) or ("chemical_substance" in source_types) or ("biolink:Drug" in source_types) or ("biolink:ChemicalSubstance" in source_types)) and (("disease" in target_types) or ("phenotypic_feature" in target_types) or ("biolink:Disease" in target_types) or ("biolink:PhenotypicFeature" in target_types)):
                        # loop over all pairs of equivalent curies and take the highest probability
                        self.response.debug(f"Predicting treatment probability between {curie_to_name[source_curie]} and {curie_to_name[target_curie]}")
                        max_probability = 0
                        converted_source_curie = self.convert_to_trained_curies(source_curie)
                        if converted_source_curie is None:
                            continue
                        else:
                            preferred_type = converted_source_curie['preferred_type']
                            if preferred_type == "drug" or preferred_type == "chemical_substance" or preferred_type == "biolink:Drug" or preferred_type == "biolink:ChemicalSubstance":
                                converted_source_curie = converted_source_curie['preferred_curie']
                            else:
                                continue
                        converted_target_curie = self.convert_to_trained_curies(target_curie)
                        if converted_target_curie is None:
                            continue
                        else:
                            preferred_type = converted_target_curie['preferred_type']
                            if preferred_type == "disease" or preferred_type == "phenotypic_feature" or preferred_type == "biolink:Disease" or preferred_type == "biolink:PhenotypicFeature":
                                converted_target_curie = converted_target_curie['preferred_curie']
                            else:
                                continue
                        if self.use_prob_db is True:
                            probability = self.pred.get_prob_from_DTD_db(converted_source_curie, converted_target_curie)
                            if probability is not None:
                                if np.isfinite(probability):
                                    max_probability = probability
                        else:
                            probability = self.pred.prob_single(converted_source_curie, converted_target_curie)
                            if probability is not None:
                                probability = probability[0]
                                if np.isfinite(probability):
                                    max_probability = probability
                        # res = list(itertools.product(converted_source_curie, converted_target_curie))
                        # if len(res) != 0:
                        #     all_probabilities = self.pred.prob_all(res)
                        #     if isinstance(all_probabilities, list):
                        #         max_probability = max([value for value in all_probabilities if np.isfinite(value)])

                        value = max_probability

                        #probability = self.pred.prob_single('ChEMBL:' + source_curie[22:], target_curie)  # FIXME: when this was trained, it was ChEMBL:123, not CHEMBL.COMPOUND:CHEMBL123
                        #if probability and np.isfinite(probability):  # finite, that's ok, otherwise, stay with default
                        #    value = probability[0]
                    elif (("drug" in target_types) or ("chemical_substance" in target_types) or ("biolink:Drug" in target_types) or ("biolink:ChemicalSubstance" in target_types)) and (("disease" in source_types) or ("phenotypic_feature" in source_types) or ("biolink:Disease" in source_types) or ("biolink:PhenotypicFeature" in source_types)):
                        #probability = self.pred.prob_single('ChEMBL:' + target_curie[22:], source_curie)  # FIXME: when this was trained, it was ChEMBL:123, not CHEMBL.COMPOUND:CHEMBL123
                        #if probability and np.isfinite(probability):  # finite, that's ok, otherwise, stay with default
                        #    value = probability[0]
                        self.response.debug(f"Predicting treatment probability between {curie_to_name[source_curie]} and {curie_to_name[target_curie]}")
                        max_probability = 0
                        converted_source_curie = self.convert_to_trained_curies(source_curie)
                        if converted_source_curie is None:
                            continue
                        else:
                            preferred_type = converted_source_curie['preferred_type']
                            if preferred_type == "disease" or preferred_type == "phenotypic_feature" or preferred_type == "biolink:Disease" or preferred_type == "biolink:PhenotypicFeature":
                                converted_source_curie = converted_source_curie['preferred_curie']
                            else:
                                continue
                        converted_target_curie = self.convert_to_trained_curies(target_curie)
                        if converted_target_curie is None:
                            continue
                        else:
                            preferred_type = converted_target_curie['preferred_type']
                            if preferred_type == "drug" or preferred_type == "chemical_substance" or preferred_type == "biolink:Drug" or preferred_type == "biolink:ChemicalSubstance":
                                converted_target_curie = converted_target_curie['preferred_curie']
                            else:
                                continue

                        if self.use_prob_db is True:
                            probability = self.pred.get_prob_from_DTD_db(converted_target_curie, converted_source_curie)
                            if probability is not None:
                                if np.isfinite(probability):
                                    max_probability = probability
                        else:
                            probability = self.pred.prob_single(converted_target_curie, converted_source_curie)
                            if probability is not None:
                                probability = probability[0]
                                if np.isfinite(probability):
                                    max_probability = probability
                        # res = list(itertools.product(converted_target_curie, converted_source_curie))
                        # if len(res) != 0:
                        #     all_probabilities = self.pred.prob_all(res)
                        #     if isinstance(all_probabilities, list):
                        #         max_probability = max([value for value in all_probabilities if np.isfinite(value)])

                        value = max_probability

                    else:
                        continue
                    if value != 0:
                        edge_attribute = EdgeAttribute(type=attribute_type, name=attribute_name, value=str(value), url=url)  # populate the attribute
                        edge.attributes.append(edge_attribute)  # append it to the list of attributes
            except:
                tb = traceback.format_exc()
                error_type, error, _ = sys.exc_info()
                self.response.error(tb, error_code=error_type.__name__)
                self.response.error(f"Something went wrong adding the drug disease treatment probability")
            else:
                self.response.info(f"Drug disease treatment probability successfully added to edges")

            return self.response
Exemplo n.º 11
0
    def compute_ngd(self):
        """
        Iterate over all the edges in the knowledge graph, compute the normalized google distance and stick that info
        on the attributes
        :default: The default value to set for NGD if it returns a nan
        :return: response
        """
        if self.response.status != 'OK':  # Catches any errors that may have been logged during initialization
            self._close_database()
            return self.response
        parameters = self.parameters
        self.response.debug(f"Computing NGD")
        self.response.info(f"Computing the normalized Google distance: weighting edges based on subject/object node "
                           f"co-occurrence frequency in PubMed abstracts")
        name = "normalized_google_distance"
        type = "EDAM:data_2526"
        value = self.parameters['default_value']
        url = "https://arax.ncats.io/api/rtx/v1/ui/#/PubmedMeshNgd"
        qg = self.message.query_graph
        kg = self.message.knowledge_graph

        # if you want to add virtual edges, identify the subject/objects, decorate the edges, add them to the KG, and then add one to the QG corresponding to them
        if 'virtual_relation_label' in parameters:
            # Figure out which node pairs to compute NGD between
            subject_qnode_key = parameters['subject_qnode_key']
            object_qnode_key = parameters['object_qnode_key']
            node_pairs_to_evaluate = ou.get_node_pairs_to_overlay(subject_qnode_key, object_qnode_key, qg, kg, self.response)
            # Grab PMID lists for all involved nodes
            involved_curies = {curie for node_pair in node_pairs_to_evaluate for curie in node_pair}
            canonicalized_curie_lookup = self._get_canonical_curies_map(list(involved_curies))
            self.load_curie_to_pmids_data(canonicalized_curie_lookup.values())
            added_flag = False  # check to see if any edges where added
            self.response.debug(f"Looping through {len(node_pairs_to_evaluate)} node pairs and calculating NGD values")
            # iterate over all pairs of these nodes, add the virtual edge, decorate with the correct attribute
            for (subject_curie, object_curie) in node_pairs_to_evaluate:
                # create the edge attribute if it can be
                canonical_subject_curie = canonicalized_curie_lookup.get(subject_curie, subject_curie)
                canonical_object_curie = canonicalized_curie_lookup.get(object_curie, object_curie)
                ngd_value, pmid_set = self.calculate_ngd_fast(canonical_subject_curie, canonical_object_curie)
                if np.isfinite(ngd_value):  # if ngd is finite, that's ok, otherwise, stay with default
                    value = ngd_value
                edge_attribute = EdgeAttribute(type=type, name=name, value=str(value), url=url)  # populate the NGD edge attribute
                pmid_attribute = EdgeAttribute(type="biolink:publications", name="publications", value=[f"PMID:{pmid}" for pmid in pmid_set])
                if edge_attribute:
                    added_flag = True
                    # make the edge, add the attribute

                    # edge properties
                    now = datetime.now()
                    edge_type = "biolink:has_normalized_google_distance_with"
                    qedge_keys = [parameters['virtual_relation_label']]
                    relation = parameters['virtual_relation_label']
                    is_defined_by = "ARAX"
                    defined_datetime = now.strftime("%Y-%m-%d %H:%M:%S")
                    provided_by = "ARAX"
                    confidence = None
                    weight = None  # TODO: could make the actual value of the attribute
                    subject_key = subject_curie
                    object_key = object_curie

                    # now actually add the virtual edges in
                    id = f"{relation}_{self.global_iter}"
                    # ensure the id is unique
                    # might need to change after expand is implemented for TRAPI 1.0
                    while id in self.message.knowledge_graph.edges:
                        id = f"{relation}_{self.global_iter}.{random.randint(10**(9-1), (10**9)-1)}"
                    self.global_iter += 1
                    edge_attribute_list = [
                        edge_attribute,
                        pmid_attribute,
                        EdgeAttribute(name="is_defined_by", value=is_defined_by, type="ARAX_TYPE_PLACEHOLDER"),
                        EdgeAttribute(name="defined_datetime", value=defined_datetime, type="metatype:Datetime"),
                        EdgeAttribute(name="provided_by", value=provided_by, type="biolink:provided_by"),
                        #EdgeAttribute(name="confidence", value=confidence, type="biolink:ConfidenceLevel"),
                        #EdgeAttribute(name="weight", value=weight, type="metatype:Float"),
                        #EdgeAttribute(name="qedge_keys", value=qedge_keys)
                    ]
                    # edge = Edge(id=id, type=edge_type, relation=relation, subject_key=subject_key,
                    #             object_key=object_key,
                    #             is_defined_by=is_defined_by, defined_datetime=defined_datetime,
                    #             provided_by=provided_by,
                    #             confidence=confidence, weight=weight, attributes=[edge_attribute], qedge_ids=qedge_ids)
                    edge = Edge(predicate=edge_type, subject=subject_key, object=object_key, relation=relation,
                                attributes=edge_attribute_list)
                    edge.qedge_keys = qedge_keys
                    self.message.knowledge_graph.edges[id] = edge

            # Now add a q_edge the query_graph since I've added an extra edge to the KG
            if added_flag:
                #edge_type = parameters['virtual_edge_type']
                edge_type = "biolink:has_normalized_google_distance_with"
                relation = parameters['virtual_relation_label']
                option_group_id = ou.determine_virtual_qedge_option_group(subject_qnode_key, object_qnode_key, qg, self.response)
                # q_edge = QEdge(id=relation, type=edge_type, relation=relation,
                #                subject_key=subject_qnode_key, object_key=object_qnode_key,
                #                option_group_id=option_group_id)
                q_edge = QEdge(predicate=edge_type, relation=relation, subject=subject_qnode_key,
                           object=object_qnode_key, option_group_id=option_group_id)
                self.message.query_graph.edges[relation]=q_edge

            self.response.info(f"NGD values successfully added to edges")
        else:  # you want to add it for each edge in the KG
            # iterate over KG edges, add the information
            try:
                # Map all nodes to their canonicalized curies in one batch (need canonical IDs for the local NGD system)
                canonicalized_curie_map = self._get_canonical_curies_map([key for key in self.message.knowledge_graph.nodes.keys()])
                self.load_curie_to_pmids_data(canonicalized_curie_map.values())
                self.response.debug(f"Looping through edges and calculating NGD values")
                for edge in self.message.knowledge_graph.edges.values():
                    # Make sure the attributes are not None
                    if not edge.attributes:
                        edge.attributes = []  # should be an array, but why not a list?
                    # now go and actually get the NGD
                    subject_curie = edge.subject
                    object_curie = edge.object
                    canonical_subject_curie = canonicalized_curie_map.get(subject_curie, subject_curie)
                    canonical_object_curie = canonicalized_curie_map.get(object_curie, object_curie)
                    ngd_value, pmid_set = self.calculate_ngd_fast(canonical_subject_curie, canonical_object_curie)
                    if np.isfinite(ngd_value):  # if ngd is finite, that's ok, otherwise, stay with default
                        value = ngd_value
                    ngd_edge_attribute = EdgeAttribute(type=type, name=name, value=str(value), url=url)  # populate the NGD edge attribute
                    pmid_edge_attribute = EdgeAttribute(type="biolink:publications", name="ngd_publications", value=[f"PMID:{pmid}" for pmid in pmid_set])
                    edge.attributes.append(ngd_edge_attribute)  # append it to the list of attributes
                    edge.attributes.append(pmid_edge_attribute)
            except:
                tb = traceback.format_exc()
                error_type, error, _ = sys.exc_info()
                self.response.error(tb, error_code=error_type.__name__)
                self.response.error(f"Something went wrong adding the NGD edge attributes")
            else:
                self.response.info(f"NGD values successfully added to edges")
            self._close_database()
            return self.response
Exemplo n.º 12
0
    def compute_ngd(self):
        """
        Iterate over all the edges in the knowledge graph, compute the normalized google distance and stick that info
        on the attributes
        :default: The default value to set for NGD if it returns a nan
        :return: response
        """
        if self.response.status != 'OK':  # Catches any errors that may have been logged during initialization
            self._close_database()
            return self.response
        parameters = self.parameters
        self.response.debug(f"Computing NGD")
        self.response.info(f"Computing the normalized Google distance: weighting edges based on subject/object node "
                           f"co-occurrence frequency in PubMed abstracts")
        name = "normalized_google_distance"
        type = "EDAM:data_2526"
        default_value = self.parameters['default_value']
        url = "https://arax.ncats.io/api/rtx/v1/ui/#/PubmedMeshNgd"
        qg = self.message.query_graph
        kg = self.message.knowledge_graph
        ngd_description = """
        Normalized google distance is a metric based on edge subject/object node co-occurrence in abstracts of all [PubMed](https://pubmed.ncbi.nlm.nih.gov/) articles. 
        The formula can be found here on [wikipedia.](https://en.wikipedia.org/wiki/Normalized_Google_distance) 
        Where in this case f(x,y) is the number of PubMed abstracts both concepts apear in, f(x)/f(y) are the number of abstracts individual concepts apear in, and N is the number of pubmed articles times the average numbver of search terms per article (27 million * 20).
        """
        
        # if you want to add virtual edges, identify the subject/objects, decorate the edges, add them to the KG, and then add one to the QG corresponding to them
        # FW: changing this so if there is a virtual relation label but no subject and object then add edges for all subject object pairs in the quesry graph.
        if 'subject_qnode_key' not in parameters and 'object_qnode_key' not in parameters and 'virtual_relation_label' in parameters:
            seen_node_pairs = set()
            qgraph_edges = copy.deepcopy(list(qg.edges.values()))
            for query_edge in qgraph_edges:
                subject_qnode_key = query_edge.subject
                object_qnode_key = query_edge.object
                if subject_qnode_key < object_qnode_key:
                    qnode_key_pair = (subject_qnode_key,object_qnode_key)
                else:
                    qnode_key_pair = (object_qnode_key,subject_qnode_key)
                # FW: check if we have already added an edge for this pair
                if qnode_key_pair in seen_node_pairs:
                    pass
                else:
                    seen_node_pairs.add(qnode_key_pair)
                    # FW: Now add the edge for this qnode pair
                    # FW NOTE: If we decide to keep these changes we should really pull this out into a method as everything after this was copy pasted from below in the 'virtual_relation_label' in parameters section
                    node_pairs_to_evaluate = ou.get_node_pairs_to_overlay(subject_qnode_key, object_qnode_key, qg, kg, self.response)
                    # Grab PMID lists for all involved nodes
                    involved_curies = {curie for node_pair in node_pairs_to_evaluate for curie in node_pair}
                    canonicalized_curie_lookup = self._get_canonical_curies_map(list(involved_curies))
                    self.load_curie_to_pmids_data(canonicalized_curie_lookup.values())
                    added_flag = False  # check to see if any edges where added
                    self.response.debug(f"Looping through {len(node_pairs_to_evaluate)} node pairs and calculating NGD values")
                    # iterate over all pairs of these nodes, add the virtual edge, decorate with the correct attribute
                    for (subject_curie, object_curie) in node_pairs_to_evaluate:
                        # create the edge attribute if it can be
                        canonical_subject_curie = canonicalized_curie_lookup.get(subject_curie, subject_curie)
                        canonical_object_curie = canonicalized_curie_lookup.get(object_curie, object_curie)
                        ngd_value, pmid_set = self.calculate_ngd_fast(canonical_subject_curie, canonical_object_curie)
                        if np.isfinite(ngd_value):  # if ngd is finite, that's ok, otherwise, stay with default
                            edge_value = ngd_value
                        else:
                            edge_value = default_value
                        edge_attribute = EdgeAttribute(attribute_type_id=type, original_attribute_name=name, value=str(edge_value), value_url=url, description=ngd_description)  # populate the NGD edge attribute
                        pmid_attribute = EdgeAttribute(attribute_type_id="biolink:publications", original_attribute_name="publications", value=[f"PMID:{pmid}" for pmid in pmid_set])
                        if edge_attribute:
                            added_flag = True
                            # make the edge, add the attribute

                            # edge properties
                            now = datetime.now()
                            edge_type = "biolink:has_normalized_google_distance_with"
                            qedge_keys = [parameters['virtual_relation_label']]
                            relation = parameters['virtual_relation_label']
                            is_defined_by = "ARAX"
                            defined_datetime = now.strftime("%Y-%m-%d %H:%M:%S")
                            provided_by = "infores:arax"
                            confidence = None
                            weight = None  # TODO: could make the actual value of the attribute
                            subject_key = subject_curie
                            object_key = object_curie

                            # now actually add the virtual edges in
                            id = f"{relation}_{self.global_iter}"
                            # ensure the id is unique
                            # might need to change after expand is implemented for TRAPI 1.0
                            while id in self.message.knowledge_graph.edges:
                                id = f"{relation}_{self.global_iter}.{random.randint(10**(9-1), (10**9)-1)}"
                            self.global_iter += 1
                            edge_attribute_list = [
                                edge_attribute,
                                pmid_attribute,
                                EdgeAttribute(original_attribute_name="virtual_relation_label", value=relation, attribute_type_id="biolink:Unknown"),
                                #EdgeAttribute(original_attribute_name="is_defined_by", value=is_defined_by, attribute_type_id="biolink:Unknown"),
                                EdgeAttribute(original_attribute_name="defined_datetime", value=defined_datetime, attribute_type_id="metatype:Datetime"),
                                EdgeAttribute(original_attribute_name="provided_by", value=provided_by, attribute_type_id="biolink:aggregator_knowledge_source", attribute_source=provided_by, value_type_id="biolink:InformationResource"),
                                EdgeAttribute(original_attribute_name=None, value=True, attribute_type_id="biolink:computed_value", attribute_source="infores:arax-reasoner-ara", value_type_id="metatype:Boolean", value_url=None, description="This edge is a container for a computed value between two nodes that is not directly attachable to other edges.")
                                #EdgeAttribute(original_attribute_name="confidence", value=confidence, attribute_type_id="biolink:ConfidenceLevel"),
                                #EdgeAttribute(original_attribute_name="weight", value=weight, attribute_type_id="metatype:Float"),
                                #EdgeAttribute(original_attribute_name="qedge_keys", value=qedge_keys)
                            ]
                            # edge = Edge(id=id, type=edge_type, relation=relation, subject_key=subject_key,
                            #             object_key=object_key,
                            #             is_defined_by=is_defined_by, defined_datetime=defined_datetime,
                            #             provided_by=provided_by,
                            #             confidence=confidence, weight=weight, attributes=[edge_attribute], qedge_ids=qedge_ids)

                            #### FIXME temporary hack by EWD
                            #edge = Edge(predicate=edge_type, subject=subject_key, object=object_key, relation=relation,
                            #            attributes=edge_attribute_list)
                            edge = Edge(predicate=edge_type, subject=subject_key, object=object_key,
                                        attributes=edge_attribute_list)
                            #edge.relation = relation
                            #### /end FIXME

                            edge.qedge_keys = qedge_keys
                            self.message.knowledge_graph.edges[id] = edge

                            #FW: check if results exist then modify them with the ngd edge
                            if self.message.results is not None and len(self.message.results) > 0:
                                ou.update_results_with_overlay_edge(subject_knode_key=subject_key, object_knode_key=object_key, kedge_key=id, message=self.message, log=self.response)

                    # Now add a q_edge the query_graph since I've added an extra edge to the KG
                    if added_flag:
                        #edge_type = parameters['virtual_edge_type']
                        edge_type = [ "biolink:has_normalized_google_distance_with" ]
                        relation = parameters['virtual_relation_label']
                        option_group_id = ou.determine_virtual_qedge_option_group(subject_qnode_key, object_qnode_key, qg, self.response)
                        # q_edge = QEdge(id=relation, type=edge_type, relation=relation,
                        #                subject_key=subject_qnode_key, object_key=object_qnode_key,
                        #                option_group_id=option_group_id)

                        #### FIXME by EWD. For later fixing
                        #q_edge = QEdge(predicates=edge_type, relation=relation, subject=subject_qnode_key,
                        #           object=object_qnode_key, option_group_id=option_group_id)
                        q_edge = QEdge(predicates=edge_type, subject=subject_qnode_key,
                                   object=object_qnode_key, option_group_id=option_group_id)
                        q_edge.relation = relation
                        #### end FIXME

                        self.message.query_graph.edges[relation]=q_edge


                    self.response.info(f"NGD values successfully added to edges for the qnode pair ({subject_qnode_key},{object_qnode_key})")

        elif 'virtual_relation_label' in parameters:
            # Figure out which node pairs to compute NGD between
            subject_qnode_key = parameters['subject_qnode_key']
            object_qnode_key = parameters['object_qnode_key']
            node_pairs_to_evaluate = ou.get_node_pairs_to_overlay(subject_qnode_key, object_qnode_key, qg, kg, self.response)
            # Grab PMID lists for all involved nodes
            involved_curies = {curie for node_pair in node_pairs_to_evaluate for curie in node_pair}
            canonicalized_curie_lookup = self._get_canonical_curies_map(list(involved_curies))
            self.load_curie_to_pmids_data(canonicalized_curie_lookup.values())
            added_flag = False  # check to see if any edges where added
            self.response.debug(f"Looping through {len(node_pairs_to_evaluate)} node pairs and calculating NGD values")
            # iterate over all pairs of these nodes, add the virtual edge, decorate with the correct attribute
            for (subject_curie, object_curie) in node_pairs_to_evaluate:
                # create the edge attribute if it can be
                canonical_subject_curie = canonicalized_curie_lookup.get(subject_curie, subject_curie)
                canonical_object_curie = canonicalized_curie_lookup.get(object_curie, object_curie)
                ngd_value, pmid_set = self.calculate_ngd_fast(canonical_subject_curie, canonical_object_curie)
                if np.isfinite(ngd_value):  # if ngd is finite, that's ok, otherwise, stay with default
                    edge_value = ngd_value
                else:
                    edge_value = default_value
                edge_attribute = EdgeAttribute(attribute_type_id=type, original_attribute_name=name, value=str(edge_value), value_url=url, description=ngd_description)  # populate the NGD edge attribute
                pmid_attribute = EdgeAttribute(attribute_type_id="biolink:publications", original_attribute_name="publications", value=[f"PMID:{pmid}" for pmid in pmid_set])
                if edge_attribute:
                    added_flag = True
                    # make the edge, add the attribute

                    # edge properties
                    now = datetime.now()
                    edge_type = "biolink:has_normalized_google_distance_with"
                    qedge_keys = [parameters['virtual_relation_label']]
                    relation = parameters['virtual_relation_label']
                    is_defined_by = "ARAX"
                    defined_datetime = now.strftime("%Y-%m-%d %H:%M:%S")
                    provided_by = "infores:arax"
                    confidence = None
                    weight = None  # TODO: could make the actual value of the attribute
                    subject_key = subject_curie
                    object_key = object_curie

                    # now actually add the virtual edges in
                    id = f"{relation}_{self.global_iter}"
                    # ensure the id is unique
                    # might need to change after expand is implemented for TRAPI 1.0
                    while id in self.message.knowledge_graph.edges:
                        id = f"{relation}_{self.global_iter}.{random.randint(10**(9-1), (10**9)-1)}"
                    self.global_iter += 1
                    edge_attribute_list = [
                        edge_attribute,
                        pmid_attribute,
                        EdgeAttribute(original_attribute_name="virtual_relation_label", value=relation, attribute_type_id="biolink:Unknown"),
                        #EdgeAttribute(original_attribute_name="is_defined_by", value=is_defined_by, attribute_type_id="biolink:Unknown"),
                        EdgeAttribute(original_attribute_name="defined_datetime", value=defined_datetime, attribute_type_id="metatype:Datetime"),
                        EdgeAttribute(original_attribute_name="provided_by", value=provided_by, attribute_type_id="biolink:aggregator_knowledge_source", attribute_source=provided_by, value_type_id="biolink:InformationResource"),
                        EdgeAttribute(original_attribute_name=None, value=True, attribute_type_id="biolink:computed_value", attribute_source="infores:arax-reasoner-ara", value_type_id="metatype:Boolean", value_url=None, description="This edge is a container for a computed value between two nodes that is not directly attachable to other edges.")
                        #EdgeAttribute(original_attribute_name="confidence", value=confidence, attribute_type_id="biolink:ConfidenceLevel"),
                        #EdgeAttribute(original_attribute_name="weight", value=weight, attribute_type_id="metatype:Float"),
                        #EdgeAttribute(original_attribute_name="qedge_keys", value=qedge_keys)
                    ]
                    # edge = Edge(id=id, type=edge_type, relation=relation, subject_key=subject_key,
                    #             object_key=object_key,
                    #             is_defined_by=is_defined_by, defined_datetime=defined_datetime,
                    #             provided_by=provided_by,
                    #             confidence=confidence, weight=weight, attributes=[edge_attribute], qedge_ids=qedge_ids)

                    #### FIXME temporary hack by EWD
                    #edge = Edge(predicate=edge_type, subject=subject_key, object=object_key, relation=relation,
                    #            attributes=edge_attribute_list)
                    edge = Edge(predicate=edge_type, subject=subject_key, object=object_key,
                                attributes=edge_attribute_list)
                    #edge.relation = relation
                    #### /end FIXME

                    edge.qedge_keys = qedge_keys
                    self.message.knowledge_graph.edges[id] = edge

                    #FW: check if results exist then modify them with the ngd edge
                    if self.message.results is not None and len(self.message.results) > 0:
                        ou.update_results_with_overlay_edge(subject_knode_key=subject_key, object_knode_key=object_key, kedge_key=id, message=self.message, log=self.response)

            # Now add a q_edge the query_graph since I've added an extra edge to the KG
            if added_flag:
                #edge_type = parameters['virtual_edge_type']
                edge_type = [ "biolink:has_normalized_google_distance_with" ]
                relation = parameters['virtual_relation_label']
                option_group_id = ou.determine_virtual_qedge_option_group(subject_qnode_key, object_qnode_key, qg, self.response)
                # q_edge = QEdge(id=relation, type=edge_type, relation=relation,
                #                subject_key=subject_qnode_key, object_key=object_qnode_key,
                #                option_group_id=option_group_id)

                #### FIXME by EWD. For later fixing
                #q_edge = QEdge(predicates=edge_type, relation=relation, subject=subject_qnode_key,
                #           object=object_qnode_key, option_group_id=option_group_id)
                q_edge = QEdge(predicates=edge_type, subject=subject_qnode_key,
                           object=object_qnode_key, option_group_id=option_group_id)
                q_edge.relation = relation
                #### end FIXME

                self.message.query_graph.edges[relation]=q_edge


            self.response.info(f"NGD values successfully added to edges")
        else:  # you want to add it for each edge in the KG
            # iterate over KG edges, add the information
            try:
                # Map all nodes to their canonicalized curies in one batch (need canonical IDs for the local NGD system)
                canonicalized_curie_map = self._get_canonical_curies_map([key for key in self.message.knowledge_graph.nodes.keys()])
                self.load_curie_to_pmids_data(canonicalized_curie_map.values())
                self.response.debug(f"Looping through edges and calculating NGD values")
                for edge in self.message.knowledge_graph.edges.values():
                    # Make sure the attributes are not None
                    if not edge.attributes:
                        edge.attributes = []  # should be an array, but why not a list?
                    # now go and actually get the NGD
                    subject_curie = edge.subject
                    object_curie = edge.object
                    canonical_subject_curie = canonicalized_curie_map.get(subject_curie, subject_curie)
                    canonical_object_curie = canonicalized_curie_map.get(object_curie, object_curie)
                    ngd_value, pmid_set = self.calculate_ngd_fast(canonical_subject_curie, canonical_object_curie)
                    if np.isfinite(ngd_value):  # if ngd is finite, that's ok, otherwise, stay with default
                        edge_value = ngd_value
                    else:
                        edge_value = default_value
                    ngd_edge_attribute = EdgeAttribute(attribute_type_id=type, original_attribute_name=name, value=str(edge_value), value_url=url, description=ngd_description)  # populate the NGD edge attribute
                    pmid_edge_attribute = EdgeAttribute(attribute_type_id="biolink:publications", original_attribute_name="ngd_publications", value_type_id="EDAM:data_1187", value=[f"PMID:{pmid}" for pmid in pmid_set])
                    edge.attributes.append(ngd_edge_attribute)  # append it to the list of attributes
                    edge.attributes.append(pmid_edge_attribute)
            except:
                tb = traceback.format_exc()
                error_type, error, _ = sys.exc_info()
                self.response.error(tb, error_code=error_type.__name__)
                self.response.error(f"Something went wrong adding the NGD edge attributes")
            else:
                self.response.info(f"NGD values successfully added to edges")
            self._close_database()
        return self.response