def _create_ngd_edge(self, ngd_value: float, subject: str, object: str, pmid_list: list) -> Tuple[str, Edge]:
    """
    Build an NGD edge between two nodes and return it together with its key.

    :param ngd_value: value stored on the NGD attribute
    :param subject: key of the edge's subject node
    :param object: key of the edge's object node
    :param pmid_list: value stored on the "publications" attribute
    :return: tuple of (edge key, Edge)
    """
    ngd_edge = Edge()
    ngd_edge.predicate = self.ngd_edge_type
    ngd_edge.subject = subject
    ngd_edge.object = object
    ngd_edge_key = f"NGD:{subject}--{ngd_edge.predicate}--{object}"

    # The NGD value itself comes first, followed by provenance and publications
    ngd_attribute = Attribute(name=self.ngd_edge_attribute_name,
                              type=self.ngd_edge_attribute_type,
                              value=ngd_value,
                              url=self.ngd_edge_attribute_url)
    provenance_attributes = [
        Attribute(name=attribute_name, value="ARAX", type=eu.get_attribute_type(attribute_name))
        for attribute_name in ("provided_by", "is_defined_by")
    ]
    publications_attribute = Attribute(name="publications",
                                       value=pmid_list,
                                       type=eu.get_attribute_type("publications"))
    ngd_edge.attributes = [ngd_attribute, *provenance_attributes, publications_attribute]
    return ngd_edge_key, ngd_edge
def _add_answers_to_kg(self, answer_kg: QGOrganizedKnowledgeGraph, reasoner_std_response: Dict[str, any],
                       input_qnode_key: str, output_qnode_key: str, qedge_key: str,
                       log: ARAXResponse) -> QGOrganizedKnowledgeGraph:
    """
    Copy the nodes and edges of a BTE response into answer_kg.

    Nodes are filed under input_qnode_key or output_qnode_key depending on whether BTE bound them to "n0" or
    "n1"; edges are filed under qedge_key and must be bound to "e1". Output nodes are re-keyed to the curie
    chosen by _get_best_equivalent_bte_curie, and edges are rewired to those new keys.

    :param answer_kg: knowledge graph to add to; returned as-is when the response has no edges
    :param reasoner_std_response: BTE response holding 'results' and 'knowledge_graph'
    :param input_qnode_key: qnode key that BTE's "n0" stands for
    :param output_qnode_key: qnode key that BTE's "n1" stands for
    :param qedge_key: qedge key that BTE's "e1" stands for
    :param log: response object used for debug/error messages
    :return: answer_kg (possibly only partially filled if an unknown qg_id was encountered)
    """
    kg_to_qg_ids_dict = self._build_kg_to_qg_id_dict(reasoner_std_response['results'])
    bte_kg = reasoner_std_response['knowledge_graph']
    bte_edges = bte_kg['edges']
    if not bte_edges:
        return answer_kg

    preferred_key_for = dict()  # BTE node key -> preferred equivalent curie (output nodes only)
    log.debug(f"Got results back from BTE for this query ({len(bte_edges)} edges)")

    for bte_node in bte_kg['nodes']:
        bte_node_key = bte_node.get('id')
        swagger_node = Node()
        swagger_node.name = bte_node.get('name')
        snake_case_type = eu.convert_string_to_snake_case(bte_node.get('type'))
        swagger_node.category = eu.convert_to_list(snake_case_type)

        # Map the returned BTE qg_ids back to the original qnode_keys in our query graph
        bte_qg_id = kg_to_qg_ids_dict['nodes'].get(bte_node_key)
        if bte_qg_id == "n0":
            qnode_key = input_qnode_key
        elif bte_qg_id == "n1":
            qnode_key = output_qnode_key
        else:
            log.error("Could not map BTE qg_id to ARAX qnode_key", error_code="UnknownQGID")
            return answer_kg

        # Only output nodes are swapped over to their preferred equivalent identifier
        if qnode_key != output_qnode_key:
            swagger_node_key = bte_node_key
        elif bte_node_key in preferred_key_for:
            swagger_node_key = preferred_key_for[bte_node_key]
        else:
            equivalent_curies = []
            for prefix, local_ids in bte_node.get('equivalent_identifiers').items():
                for local_id in local_ids:
                    equivalent_curies.append(f"{prefix}:{eu.get_curie_local_id(local_id)}")
            swagger_node_key = self._get_best_equivalent_bte_curie(equivalent_curies, swagger_node.category[0])
            preferred_key_for[bte_node_key] = swagger_node_key

        answer_kg.add_node(swagger_node_key, swagger_node, qnode_key)

    for bte_edge in bte_edges:
        swagger_edge_key = bte_edge.get("id")
        swagger_edge = Edge()
        swagger_edge.predicate = bte_edge.get('type')
        # Point the edge at the remapped node keys where a remapping exists
        swagger_edge.subject = preferred_key_for.get(bte_edge.get('source_id'), bte_edge.get('source_id'))
        swagger_edge.object = preferred_key_for.get(bte_edge.get('target_id'), bte_edge.get('target_id'))
        provided_by_attribute = Attribute(name="provided_by",
                                          value=bte_edge.get('edge_source'),
                                          type=eu.get_attribute_type("provided_by"))
        is_defined_by_attribute = Attribute(name="is_defined_by",
                                            value="BTE",
                                            type=eu.get_attribute_type("is_defined_by"))
        swagger_edge.attributes = [provided_by_attribute, is_defined_by_attribute]

        # Map the returned BTE qg_id back to the original qedge_key in our query graph
        if kg_to_qg_ids_dict['edges'].get(swagger_edge_key) != "e1":
            log.error("Could not map BTE qg_id to ARAX qedge_key", error_code="UnknownQGID")
            return answer_kg
        answer_kg.add_edge(swagger_edge_key, swagger_edge, qedge_key)

    return answer_kg
def _create_swagger_edge_from_kp_edge(self, kp_edge_key: str, kp_edge: Dict[str, any]) -> Tuple[str, Edge]:
    """
    Convert an edge dict returned by the KP into an Edge.

    The return annotation is Tuple[str, Edge] because the function returns the edge key along with the
    edge, like the other edge converters.

    :param kp_edge_key: key of the edge; passed through unchanged
    :param kp_edge: edge dict with 'subject', 'object' and 'predicate' entries
    :return: tuple of (kp_edge_key, Edge)
    :raises KeyError: if kp_edge lacks 'subject', 'object' or 'predicate'
    """
    swagger_edge = Edge(subject=kp_edge['subject'],
                        object=kp_edge['object'],
                        predicate=kp_edge['predicate'])
    # Provenance: the KP supplied the edge, ARAX defined this representation of it
    swagger_edge.attributes = [
        Attribute(name="provided_by", value=self.kp_name, type=eu.get_attribute_type("provided_by")),
        Attribute(name="is_defined_by", value="ARAX", type=eu.get_attribute_type("is_defined_by")),
    ]
    return kp_edge_key, swagger_edge
def _convert_kg2c_edge_to_swagger_edge(self, neo4j_edge: Dict[str, any]) -> Tuple[str, Edge]:
    """
    Convert a KG2c edge dict into an Edge.

    :param neo4j_edge: edge dict; 'id', 'simplified_edge_label', 'subject' and 'object' are read with .get()
    :return: tuple of ("KG2c:<id>", Edge)
    """
    swagger_edge = Edge()
    swagger_edge.predicate = neo4j_edge.get("simplified_edge_label")
    swagger_edge.subject = neo4j_edge.get("subject")
    swagger_edge.object = neo4j_edge.get("object")

    # Remaining properties of interest are carried over as attributes
    swagger_edge.attributes = self._create_swagger_attributes(["provided_by", "publications"], neo4j_edge)
    swagger_edge.attributes.append(Attribute(name="is_defined_by",
                                             value="ARAX/KG2c",
                                             type=eu.get_attribute_type("is_defined_by")))
    return f"KG2c:{neo4j_edge.get('id')}", swagger_edge
def _convert_kg1_edge_to_swagger_edge(self, neo4j_edge: Dict[str, any],
                                      node_uuid_to_curie_dict: Dict[str, str]) -> Tuple[str, Edge]:
    """
    Convert a KG1 edge dict into an Edge.

    :param neo4j_edge: edge dict; its 'source_node_uuid' and 'target_node_uuid' are translated to node keys
    :param node_uuid_to_curie_dict: lookup from node UUID to curie
    :return: tuple of ("KG1:<id>", Edge)
    :raises KeyError: if either endpoint UUID is absent from node_uuid_to_curie_dict
    """
    source_uuid = neo4j_edge.get("source_node_uuid")
    target_uuid = neo4j_edge.get("target_node_uuid")

    swagger_edge = Edge()
    swagger_edge.predicate = neo4j_edge.get("predicate")
    swagger_edge.subject = node_uuid_to_curie_dict[source_uuid]
    swagger_edge.object = node_uuid_to_curie_dict[target_uuid]
    swagger_edge.relation = neo4j_edge.get("relation")

    # Remaining properties of interest are carried over as attributes
    swagger_edge.attributes = self._create_swagger_attributes(["provided_by", "probability"], neo4j_edge)
    swagger_edge.attributes.append(Attribute(name="is_defined_by",
                                             value="ARAX/KG1",
                                             type=eu.get_attribute_type("is_defined_by")))
    return f"KG1:{neo4j_edge.get('id')}", swagger_edge
def _convert_kg2_edge_to_swagger_edge(self, neo4j_edge: Dict[str, any]) -> Tuple[str, Edge]:
    """
    Convert a KG2 edge dict into an Edge.

    The return annotation is Tuple[str, Edge] because the function returns the edge key along with the
    edge, like the KG1 and KG2c converters.

    :param neo4j_edge: edge dict; 'id', 'simplified_edge_label', 'subject', 'object' and 'relation' are
                       read with .get()
    :return: tuple of ("KG2:<id>", Edge)
    """
    swagger_edge = Edge()
    swagger_edge_key = f"KG2:{neo4j_edge.get('id')}"
    swagger_edge.predicate = neo4j_edge.get("simplified_edge_label")
    swagger_edge.subject = neo4j_edge.get("subject")
    swagger_edge.object = neo4j_edge.get("object")
    swagger_edge.relation = neo4j_edge.get("relation")

    # Add additional properties on KG2 edges as swagger Attribute objects
    other_properties = [
        "provided_by",
        "publications",
        "negated",
        "relation_curie",
        "simplified_relation_curie",
        "simplified_relation",
        "edge_label",
    ]
    swagger_edge.attributes = self._create_swagger_attributes(other_properties, neo4j_edge)
    is_defined_by_attribute = Attribute(name="is_defined_by",
                                        value="ARAX/KG2",
                                        type=eu.get_attribute_type("is_defined_by"))
    swagger_edge.attributes.append(is_defined_by_attribute)
    return swagger_edge_key, swagger_edge
def _convert_to_swagger_edge(self, subject: str, object: str, name: str, value: float) -> Tuple[str, Edge]:
    """
    Build an edge whose predicate is "biolink:<name>" and which carries value as an attribute.

    :param subject: key of the edge's subject node
    :param object: key of the edge's object node
    :param name: used for the predicate, the edge key and the name of the value attribute
    :param value: stored on the attribute as a string
    :return: tuple of ("CHP:<subject>-<name>-<object>", Edge)
    """
    swagger_edge = Edge()
    swagger_edge.predicate = f"biolink:{name}"
    swagger_edge.subject = subject
    swagger_edge.object = object
    swagger_edge.relation = None

    value_attribute = Attribute(type="EDAM:data_0951",
                                name=name,
                                value=str(value),
                                url="https://github.com/di2ag/chp_client")
    provided_by_attribute = Attribute(name="provided_by",
                                      value=self.kp_name,
                                      type=eu.get_attribute_type("provided_by"))
    is_defined_by_attribute = Attribute(name="is_defined_by",
                                        value="ARAX",
                                        type=eu.get_attribute_type("is_defined_by"))
    swagger_edge.attributes = [value_attribute, provided_by_attribute, is_defined_by_attribute]
    return f"CHP:{subject}-{name}-{object}", swagger_edge
def _create_swagger_edge_from_kp_edge(self, kp_edge: Dict[str, any]) -> Tuple[str, Edge]:
    """
    Convert an edge dict returned by the KP into an Edge, carrying over its score when present.

    :param kp_edge: edge dict with 'id', 'source_id', 'target_id' and 'type' entries, plus optional
                    'score' and 'score_name' entries
    :return: tuple of (kp_edge['id'], Edge)
    :raises KeyError: if a required entry is missing, or a score is present without 'score_name'
    """
    swagger_edge = Edge(subject=kp_edge['source_id'],
                        object=kp_edge['target_id'],
                        predicate=kp_edge['type'])
    swagger_edge.attributes = [
        Attribute(name="provided_by", value=self.kp_name, type=eu.get_attribute_type("provided_by")),
        Attribute(name="is_defined_by", value="ARAX", type=eu.get_attribute_type("is_defined_by")),
    ]

    # Some returned edges are missing a score value for whatever reason. Only a missing/blank score is
    # skipped: a plain truthiness test would also throw away a legitimate score of 0.
    score_value = kp_edge.get('score')
    if score_value is not None and score_value != "":
        # The score name is only needed (and only required to exist) when there is a score to attach
        score_name = kp_edge['score_name']
        swagger_edge.attributes.append(Attribute(name=score_name,
                                                 type=self.score_type_lookup.get(score_name, "biolink:Unknown"),
                                                 value=score_value))
    return kp_edge['id'], swagger_edge
def _create_ngd_edge(self, ngd_value: float, subject: str, object: str, pmid_list: list) -> Tuple[str, Edge]:
    """
    Build an NGD edge between two nodes and return it together with its key.

    :param ngd_value: value stored on the NGD attribute
    :param subject: key of the edge's subject node
    :param object: key of the edge's object node
    :param pmid_list: passed to the decorator to create the "publications" attribute
    :return: tuple of (edge key, Edge)
    """
    ngd_edge = Edge()
    ngd_edge.predicate = self.ngd_edge_predicate
    ngd_edge.subject = subject
    ngd_edge.object = object
    ngd_edge_key = f"NGD:{subject}--{ngd_edge.predicate}--{object}"

    kp_description = "ARAX's in-house normalized google distance database."
    # The NGD value comes first, then publications and the source/provenance attributes
    ngd_edge.attributes = [
        Attribute(original_attribute_name=self.ngd_edge_attribute_name,
                  attribute_type_id=self.ngd_edge_attribute_type,
                  value=ngd_value),
        self.decorator.create_attribute("publications", pmid_list),
        eu.get_kp_source_attribute("infores:arax-normalized-google-distance",
                                   arax_kp=True,
                                   description=kp_description),
        eu.get_arax_source_attribute(),
        eu.get_computed_value_attribute(),
    ]
    return ngd_edge_key, ngd_edge
def predict_drug_treats_disease(self):
    """
    Add drug-disease treatment probabilities to the knowledge graph.

    With a 'virtual_relation_label' parameter, a new "biolink:probably_treats" edge is added for every
    (drug node bound to 'subject_qnode_key', disease node bound to 'object_qnode_key') pair with a non-zero
    probability, plus one matching query graph edge if anything was added. Without it, every existing edge
    that joins a drug to a disease (in either direction) gets a "probability_treats" attribute instead.

    A probability of 0 (the model returned 0, nothing, or a non-finite number) is never recorded.

    :return: response
    """
    parameters = self.parameters
    self.response.debug(f"Computing drug disease treatment probability based on a machine learning model")
    self.response.info(f"Computing drug disease treatment probability based on a machine learning model: See [this publication](https://doi.org/10.1101/765305) for more details about how this is accomplished.")

    attribute_name = "probability_treats"
    attribute_type = "EDAM:data_0951"
    url = "https://doi.org/10.1101/765305"
    drug_categories = ("drug", "chemical_substance", "biolink:Drug", "biolink:ChemicalSubstance")
    disease_categories = ("disease", "phenotypic_feature", "biolink:Disease", "biolink:PhenotypicFeature")

    def has_category(node_categories, wanted_categories):
        """Tell whether any wanted category appears in a node's categories (none at all means no)."""
        if not node_categories:
            return False
        return any(category in node_categories for category in wanted_categories)

    def preferred_trained_curie(curie, allowed_types):
        """Return the curie the model was trained on, or None if unknown or not of an allowed type."""
        converted = self.convert_to_trained_curies(curie)
        if converted is None or converted['preferred_type'] not in allowed_types:
            return None
        return converted['preferred_curie']

    def treatment_probability(drug_curie, disease_curie):
        """Return the finite probability that the drug treats the disease, or 0 when there is none."""
        if self.use_prob_db is True:
            probability = self.pred.get_prob_from_DTD_db(drug_curie, disease_curie)
        else:
            probability = self.pred.prob_single(drug_curie, disease_curie)
            if probability is not None:
                probability = probability[0]
        if probability is not None and np.isfinite(probability):
            return probability
        return 0

    # if you want to add virtual edges, identify the source/targets, decorate the edges, add them to the KG,
    # and then add one to the QG corresponding to them
    if 'virtual_relation_label' in parameters:
        source_curies_to_decorate = set()
        target_curies_to_decorate = set()
        curie_to_name = dict()
        # identify the nodes that we should be adding virtual edges for
        for node_key, node in self.message.knowledge_graph.nodes.items():
            if not hasattr(node, 'qnode_keys'):
                continue
            # the category checks are now NOT done by ARAX_overlay, so they have to happen here
            if parameters['subject_qnode_key'] in node.qnode_keys and has_category(node.category, drug_categories):
                source_curies_to_decorate.add(node_key)
                curie_to_name[node_key] = node.name
            if parameters['object_qnode_key'] in node.qnode_keys and has_category(node.category, disease_categories):
                target_curies_to_decorate.add(node_key)
                curie_to_name[node_key] = node.name

        added_flag = False  # check to see if any edges where added
        # iterate over all pairs of these nodes, add the virtual edge, decorate with the correct attribute
        for source_curie, target_curie in itertools.product(source_curies_to_decorate, target_curies_to_decorate):
            self.response.debug(f"Predicting probability that {curie_to_name[source_curie]} treats {curie_to_name[target_curie]}")
            drug_curie = preferred_trained_curie(source_curie, drug_categories)
            if drug_curie is None:
                continue
            disease_curie = preferred_trained_curie(target_curie, disease_categories)
            if disease_curie is None:
                continue
            value = treatment_probability(drug_curie, disease_curie)
            if value == 0:
                continue

            added_flag = True
            relation = parameters['virtual_relation_label']
            defined_datetime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            edge_key = f"{relation}_{self.global_iter}"
            self.global_iter += 1
            edge_attribute_list = [
                EdgeAttribute(type=attribute_type, name=attribute_name, value=str(value), url=url),
                EdgeAttribute(name="is_defined_by", value="ARAX", type="ARAX_TYPE_PLACEHOLDER"),
                EdgeAttribute(name="defined_datetime", value=defined_datetime, type="metatype:Datetime"),
                EdgeAttribute(name="provided_by", value="ARAX", type="biolink:provided_by"),
            ]
            # the virtual edge joins the original KG node keys, not the converted curies
            edge = Edge(predicate="biolink:probably_treats", subject=source_curie, object=target_curie,
                        relation=relation, attributes=edge_attribute_list)
            edge.qedge_keys = [relation]
            self.message.knowledge_graph.edges[edge_key] = edge

        # Now add a q_edge the query_graph since I've added an extra edge to the KG
        if added_flag:
            relation = parameters['virtual_relation_label']
            subject_qnode_key = parameters['subject_qnode_key']
            object_qnode_key = parameters['object_qnode_key']
            option_group_id = ou.determine_virtual_qedge_option_group(subject_qnode_key, object_qnode_key,
                                                                      self.message.query_graph, self.response)
            q_edge = QEdge(predicate="biolink:probably_treats", relation=relation, subject=subject_qnode_key,
                           object=object_qnode_key, option_group_id=option_group_id)
            self.message.query_graph.edges[relation] = q_edge
        return self.response

    # otherwise you want to add it for each edge in the KG
    try:
        # map curies to types
        curie_to_type = dict()
        curie_to_name = dict()
        for node_key, node in self.message.knowledge_graph.nodes.items():
            curie_to_type[node_key] = node.category
            curie_to_name[node_key] = node.name

        # then iterate over the edges and decorate if appropriate
        for edge in self.message.knowledge_graph.edges.values():
            # Make sure the edge_attributes are not None
            if not edge.attributes:
                edge.attributes = []
            source_curie = edge.subject
            target_curie = edge.object
            source_types = curie_to_type[source_curie]
            target_types = curie_to_type[target_curie]

            # work out which end of the edge is the drug and which the disease
            if has_category(source_types, drug_categories) and has_category(target_types, disease_categories):
                source_role, target_role = drug_categories, disease_categories
            elif has_category(target_types, drug_categories) and has_category(source_types, disease_categories):
                source_role, target_role = disease_categories, drug_categories
            else:
                continue

            self.response.debug(f"Predicting treatment probability between {curie_to_name[source_curie]} and {curie_to_name[target_curie]}")
            converted_source_curie = preferred_trained_curie(source_curie, source_role)
            if converted_source_curie is None:
                continue
            converted_target_curie = preferred_trained_curie(target_curie, target_role)
            if converted_target_curie is None:
                continue

            # the model always takes (drug, disease), whichever way round the edge points
            if source_role is drug_categories:
                value = treatment_probability(converted_source_curie, converted_target_curie)
            else:
                value = treatment_probability(converted_target_curie, converted_source_curie)

            if value != 0:
                edge.attributes.append(EdgeAttribute(type=attribute_type, name=attribute_name,
                                                     value=str(value), url=url))
    except Exception:
        # Report the failure on the response instead of crashing the overlay; KeyboardInterrupt and
        # SystemExit are deliberately not intercepted
        tb = traceback.format_exc()
        error_type, error, _ = sys.exc_info()
        self.response.error(tb, error_code=error_type.__name__)
        self.response.error(f"Something went wrong adding the drug disease treatment probability")
    else:
        self.response.info(f"Drug disease treatment probability successfully added to edges")
    return self.response
def compute_ngd(self):
    """
    Compute the normalized Google distance (NGD) for node pairs in the knowledge graph.

    With a 'virtual_relation_label' parameter, a new "biolink:has_normalized_google_distance_with" edge is
    added for every node pair selected for 'subject_qnode_key'/'object_qnode_key', plus one matching query
    graph edge. Without it, every existing edge gets NGD and PMID attributes appended instead.

    Whenever the computed NGD is not finite, the 'default_value' parameter is recorded in its place. The
    database is closed before returning, including when an exception propagates out of this method.

    :return: response
    """
    if self.response.status != 'OK':  # Catches any errors that may have been logged during initialization
        self._close_database()
        return self.response

    try:
        parameters = self.parameters
        self.response.debug(f"Computing NGD")
        self.response.info(f"Computing the normalized Google distance: weighting edges based on subject/object node "
                           f"co-occurrence frequency in PubMed abstracts")
        name = "normalized_google_distance"
        ngd_type = "EDAM:data_2526"
        default_value = parameters['default_value']
        url = "https://arax.ncats.io/api/rtx/v1/ui/#/PubmedMeshNgd"
        qg = self.message.query_graph
        kg = self.message.knowledge_graph

        # if you want to add virtual edges, identify the subject/objects, decorate the edges, add them to the
        # KG, and then add one to the QG corresponding to them
        if 'virtual_relation_label' in parameters:
            # Figure out which node pairs to compute NGD between
            subject_qnode_key = parameters['subject_qnode_key']
            object_qnode_key = parameters['object_qnode_key']
            relation = parameters['virtual_relation_label']
            edge_type = "biolink:has_normalized_google_distance_with"
            node_pairs_to_evaluate = ou.get_node_pairs_to_overlay(subject_qnode_key, object_qnode_key, qg, kg,
                                                                  self.response)
            # Grab PMID lists for all involved nodes
            involved_curies = {curie for node_pair in node_pairs_to_evaluate for curie in node_pair}
            canonicalized_curie_lookup = self._get_canonical_curies_map(list(involved_curies))
            self.load_curie_to_pmids_data(canonicalized_curie_lookup.values())
            added_flag = False  # check to see if any edges where added
            self.response.debug(f"Looping through {len(node_pairs_to_evaluate)} node pairs and calculating NGD values")
            # iterate over all pairs of these nodes, add the virtual edge, decorate with the correct attribute
            for subject_curie, object_curie in node_pairs_to_evaluate:
                canonical_subject_curie = canonicalized_curie_lookup.get(subject_curie, subject_curie)
                canonical_object_curie = canonicalized_curie_lookup.get(object_curie, object_curie)
                ngd_value, pmid_set = self.calculate_ngd_fast(canonical_subject_curie, canonical_object_curie)
                # Decide per pair: a non-finite NGD must fall back to the default, not to whatever the
                # previous pair happened to produce
                value = ngd_value if np.isfinite(ngd_value) else default_value
                edge_attribute = EdgeAttribute(type=ngd_type, name=name, value=str(value), url=url)
                pmid_attribute = EdgeAttribute(type="biolink:publications", name="publications",
                                               value=[f"PMID:{pmid}" for pmid in pmid_set])
                added_flag = True

                defined_datetime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                edge_key = f"{relation}_{self.global_iter}"
                # ensure the key is unique
                # might need to change after expand is implemented for TRAPI 1.0
                while edge_key in kg.edges:
                    edge_key = f"{relation}_{self.global_iter}.{random.randint(10**(9-1), (10**9)-1)}"
                self.global_iter += 1
                edge_attribute_list = [
                    edge_attribute,
                    pmid_attribute,
                    EdgeAttribute(name="is_defined_by", value="ARAX", type="ARAX_TYPE_PLACEHOLDER"),
                    EdgeAttribute(name="defined_datetime", value=defined_datetime, type="metatype:Datetime"),
                    EdgeAttribute(name="provided_by", value="ARAX", type="biolink:provided_by"),
                ]
                # the virtual edge joins the original KG node keys, not the canonicalized curies
                edge = Edge(predicate=edge_type, subject=subject_curie, object=object_curie, relation=relation,
                            attributes=edge_attribute_list)
                edge.qedge_keys = [relation]
                kg.edges[edge_key] = edge

            # Now add a q_edge the query_graph since I've added an extra edge to the KG
            if added_flag:
                option_group_id = ou.determine_virtual_qedge_option_group(subject_qnode_key, object_qnode_key, qg,
                                                                          self.response)
                q_edge = QEdge(predicate=edge_type, relation=relation, subject=subject_qnode_key,
                               object=object_qnode_key, option_group_id=option_group_id)
                qg.edges[relation] = q_edge

            self.response.info(f"NGD values successfully added to edges")
        else:  # you want to add it for each edge in the KG
            try:
                # Map all nodes to their canonicalized curies in one batch (need canonical IDs for the local
                # NGD system)
                canonicalized_curie_map = self._get_canonical_curies_map(list(kg.nodes.keys()))
                self.load_curie_to_pmids_data(canonicalized_curie_map.values())
                self.response.debug(f"Looping through edges and calculating NGD values")
                for edge in kg.edges.values():
                    # Make sure the attributes are not None
                    if not edge.attributes:
                        edge.attributes = []
                    subject_curie = edge.subject
                    object_curie = edge.object
                    canonical_subject_curie = canonicalized_curie_map.get(subject_curie, subject_curie)
                    canonical_object_curie = canonicalized_curie_map.get(object_curie, object_curie)
                    ngd_value, pmid_set = self.calculate_ngd_fast(canonical_subject_curie, canonical_object_curie)
                    # Same per-edge fallback as above: never reuse the previous edge's value
                    value = ngd_value if np.isfinite(ngd_value) else default_value
                    edge.attributes.append(EdgeAttribute(type=ngd_type, name=name, value=str(value), url=url))
                    edge.attributes.append(EdgeAttribute(type="biolink:publications", name="ngd_publications",
                                                         value=[f"PMID:{pmid}" for pmid in pmid_set]))
            except Exception:
                # Report the failure on the response instead of crashing the overlay; KeyboardInterrupt and
                # SystemExit are deliberately not intercepted
                tb = traceback.format_exc()
                error_type, error, _ = sys.exc_info()
                self.response.error(tb, error_code=error_type.__name__)
                self.response.error(f"Something went wrong adding the NGD edge attributes")
            else:
                self.response.info(f"NGD values successfully added to edges")
    finally:
        # Release the database even if the virtual-edge branch raised
        self._close_database()
    return self.response
def compute_ngd(self):
    """
    Compute normalized Google distance (NGD) values and attach them to the knowledge graph.

    One of three modes runs, chosen from ``self.parameters``:

    * ``virtual_relation_label`` is present and neither ``subject_qnode_key`` nor
      ``object_qnode_key`` is: virtual NGD edges are added for every distinct
      (unordered) pair of qnodes joined by an edge of the query graph.
    * ``virtual_relation_label`` is present otherwise: virtual NGD edges are added
      for the qnode pair named by ``subject_qnode_key`` / ``object_qnode_key``.
    * ``virtual_relation_label`` is absent: an NGD attribute and a publications
      attribute are appended to every edge already in the knowledge graph.

    Non-finite NGD values are replaced by ``self.parameters['default_value']``.
    The database is closed via ``self._close_database()`` on every exit path,
    including when an exception propagates out of this method.

    :return: response
    """
    if self.response.status != 'OK':  # Catches any errors that may have been logged during initialization
        self._close_database()
        return self.response

    try:
        parameters = self.parameters
        self.response.debug("Computing NGD")
        self.response.info("Computing the normalized Google distance: weighting edges based on subject/object node "
                           "co-occurrence frequency in PubMed abstracts")
        default_value = self.parameters['default_value']
        qg = self.message.query_graph
        kg = self.message.knowledge_graph
        # Runtime text attached to every NGD attribute; reproduced verbatim from the original literal.
        # NOTE(review): the literal is shown with collapsed whitespace in the reviewed source -
        # confirm the original line breaks before merging.
        ngd_description = """ Normalized google distance is a metric based on edge subject/object node co-occurrence in abstracts of all [PubMed](https://pubmed.ncbi.nlm.nih.gov/) articles. The formula can be found here on [wikipedia.](https://en.wikipedia.org/wiki/Normalized_Google_distance) Where in this case f(x,y) is the number of PubMed abstracts both concepts apear in, f(x)/f(y) are the number of abstracts individual concepts apear in, and N is the number of pubmed articles times the average numbver of search terms per article (27 million * 20). """
        # Keyword arguments shared by every NGD EdgeAttribute; only `value` differs per node pair.
        ngd_attribute_kwargs = {
            "attribute_type_id": "EDAM:data_2526",
            "original_attribute_name": "normalized_google_distance",
            "value_url": "https://arax.ncats.io/api/rtx/v1/ui/#/PubmedMeshNgd",
            "description": ngd_description,
        }

        # If you want to add virtual edges: identify the subject/objects, decorate the edges, add them to
        # the KG, and then add one to the QG corresponding to them.
        # FW: if there is a virtual relation label but no subject and object then add edges for all
        # subject/object pairs in the query graph.
        if ('subject_qnode_key' not in parameters and 'object_qnode_key' not in parameters
                and 'virtual_relation_label' in parameters):
            relation = parameters['virtual_relation_label']
            seen_node_pairs = set()
            # Iterate over a copy: the helper inserts a virtual qedge into the query graph for each pair.
            for query_edge in copy.deepcopy(list(qg.edges.values())):
                subject_qnode_key = query_edge.subject
                object_qnode_key = query_edge.object
                # Order-insensitive key so (a, b) and (b, a) count as the same pair.
                if subject_qnode_key < object_qnode_key:
                    qnode_key_pair = (subject_qnode_key, object_qnode_key)
                else:
                    qnode_key_pair = (object_qnode_key, subject_qnode_key)
                if qnode_key_pair in seen_node_pairs:
                    continue
                seen_node_pairs.add(qnode_key_pair)
                self._add_virtual_ngd_edges(subject_qnode_key, object_qnode_key, relation, default_value,
                                            ngd_attribute_kwargs, qg, kg)
                self.response.info(f"NGD values successfully added to edges for the qnode pair ({subject_qnode_key},{object_qnode_key})")
        elif 'virtual_relation_label' in parameters:
            # Figure out which node pairs to compute NGD between
            subject_qnode_key = parameters['subject_qnode_key']
            object_qnode_key = parameters['object_qnode_key']
            self._add_virtual_ngd_edges(subject_qnode_key, object_qnode_key, parameters['virtual_relation_label'],
                                        default_value, ngd_attribute_kwargs, qg, kg)
            self.response.info("NGD values successfully added to edges")
        else:  # you want to add it for each edge in the KG
            try:
                # Map all nodes to their canonicalized curies in one batch
                # (need canonical IDs for the local NGD system)
                canonicalized_curie_map = self._get_canonical_curies_map(
                    list(self.message.knowledge_graph.nodes.keys()))
                self.load_curie_to_pmids_data(canonicalized_curie_map.values())
                self.response.debug("Looping through edges and calculating NGD values")
                for edge in self.message.knowledge_graph.edges.values():
                    # Make sure the attributes are not None
                    if not edge.attributes:
                        edge.attributes = []
                    edge_value, pmid_set = self._ngd_value_or_default(edge.subject, edge.object,
                                                                      canonicalized_curie_map, default_value)
                    ngd_edge_attribute = EdgeAttribute(value=str(edge_value), **ngd_attribute_kwargs)
                    pmid_edge_attribute = EdgeAttribute(attribute_type_id="biolink:publications",
                                                        original_attribute_name="ngd_publications",
                                                        value_type_id="EDAM:data_1187",
                                                        value=[f"PMID:{pmid}" for pmid in pmid_set])
                    edge.attributes.append(ngd_edge_attribute)
                    edge.attributes.append(pmid_edge_attribute)
            except Exception as exc:
                # Best effort: report the failure on the response instead of raising.
                # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
                tb = traceback.format_exc()
                self.response.error(tb, error_code=type(exc).__name__)
                self.response.error("Something went wrong adding the NGD edge attributes")
            else:
                self.response.info("NGD values successfully added to edges")
    finally:
        # Release the database even when one of the branches above raises.
        self._close_database()
    return self.response


def _ngd_value_or_default(self, subject_curie, object_curie, canonical_curie_lookup, default_value):
    """
    Compute the NGD for one node pair, substituting the default for non-finite results.

    :param subject_curie: curie of the subject node as it appears in the knowledge graph
    :param object_curie: curie of the object node as it appears in the knowledge graph
    :param canonical_curie_lookup: mapping from curie to canonical curie; curies missing
        from the mapping are used unchanged
    :param default_value: value returned in place of a non-finite NGD
    :return: tuple ``(value, pmids)`` where ``pmids`` is the second element returned by
        ``self.calculate_ngd_fast``
    """
    canonical_subject_curie = canonical_curie_lookup.get(subject_curie, subject_curie)
    canonical_object_curie = canonical_curie_lookup.get(object_curie, object_curie)
    ngd_value, pmid_set = self.calculate_ngd_fast(canonical_subject_curie, canonical_object_curie)
    if not np.isfinite(ngd_value):  # if ngd is finite, that's ok, otherwise, stay with default
        ngd_value = default_value
    return ngd_value, pmid_set


def _add_virtual_ngd_edges(self, subject_qnode_key, object_qnode_key, relation, default_value,
                           ngd_attribute_kwargs, qg, kg):
    """
    Add one virtual NGD edge per overlay node pair of a qnode pair, plus the matching virtual qedge.

    For every node pair returned by ``ou.get_node_pairs_to_overlay`` a new edge carrying the
    NGD value and its supporting PMIDs is stored in ``self.message.knowledge_graph.edges``.
    When the message already has results they are updated through
    ``ou.update_results_with_overlay_edge``. If at least one edge was added, a virtual qedge
    is stored in ``self.message.query_graph.edges`` under ``relation``.

    :param subject_qnode_key: key of the subject qnode
    :param object_qnode_key: key of the object qnode
    :param relation: the virtual relation label; used as edge-key prefix, as the qedge key
        and as the ``virtual_relation_label`` attribute value
    :param default_value: value used when the NGD is not finite
    :param ngd_attribute_kwargs: keyword arguments (everything except ``value``) for the
        NGD ``EdgeAttribute``
    :param qg: the query graph of the message
    :param kg: the knowledge graph of the message
    """
    node_pairs_to_evaluate = ou.get_node_pairs_to_overlay(subject_qnode_key, object_qnode_key, qg, kg,
                                                          self.response)
    # Grab PMID lists for all involved nodes
    involved_curies = {curie for node_pair in node_pairs_to_evaluate for curie in node_pair}
    canonicalized_curie_lookup = self._get_canonical_curies_map(list(involved_curies))
    self.load_curie_to_pmids_data(canonicalized_curie_lookup.values())
    added_flag = False  # check to see if any edges where added
    self.response.debug(f"Looping through {len(node_pairs_to_evaluate)} node pairs and calculating NGD values")
    provided_by = "infores:arax"
    for subject_curie, object_curie in node_pairs_to_evaluate:
        edge_value, pmid_set = self._ngd_value_or_default(subject_curie, object_curie,
                                                          canonicalized_curie_lookup, default_value)
        edge_attribute = EdgeAttribute(value=str(edge_value), **ngd_attribute_kwargs)
        pmid_attribute = EdgeAttribute(attribute_type_id="biolink:publications",
                                       original_attribute_name="publications",
                                       value=[f"PMID:{pmid}" for pmid in pmid_set])
        # NOTE(review): presumably always true for a freshly built attribute; the check is
        # kept so behaviour matches the original.
        if not edge_attribute:
            continue
        added_flag = True
        defined_datetime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        # Ensure the edge key is unique
        # (might need to change after expand is implemented for TRAPI 1.0)
        edge_key = f"{relation}_{self.global_iter}"
        while edge_key in self.message.knowledge_graph.edges:
            edge_key = f"{relation}_{self.global_iter}.{random.randint(10**(9-1), (10**9)-1)}"
        self.global_iter += 1
        edge_attribute_list = [
            edge_attribute,
            pmid_attribute,
            EdgeAttribute(original_attribute_name="virtual_relation_label", value=relation,
                          attribute_type_id="biolink:Unknown"),
            EdgeAttribute(original_attribute_name="defined_datetime", value=defined_datetime,
                          attribute_type_id="metatype:Datetime"),
            EdgeAttribute(original_attribute_name="provided_by", value=provided_by,
                          attribute_type_id="biolink:aggregator_knowledge_source",
                          attribute_source=provided_by, value_type_id="biolink:InformationResource"),
            EdgeAttribute(original_attribute_name=None, value=True,
                          attribute_type_id="biolink:computed_value",
                          attribute_source="infores:arax-reasoner-ara", value_type_id="metatype:Boolean",
                          value_url=None,
                          description="This edge is a container for a computed value between two nodes that is not directly attachable to other edges."),
        ]
        # FIXME (EWD, carried over): `relation` is deliberately not passed to Edge(...).
        edge = Edge(predicate="biolink:has_normalized_google_distance_with", subject=subject_curie,
                    object=object_curie, attributes=edge_attribute_list)
        edge.qedge_keys = [relation]
        self.message.knowledge_graph.edges[edge_key] = edge
        # FW: check if results exist then modify them with the ngd edge
        if self.message.results is not None and len(self.message.results) > 0:
            ou.update_results_with_overlay_edge(subject_knode_key=subject_curie, object_knode_key=object_curie,
                                                kedge_key=edge_key, message=self.message, log=self.response)

    # Now add a q_edge to the query_graph since an extra edge was added to the KG
    if added_flag:
        option_group_id = ou.determine_virtual_qedge_option_group(subject_qnode_key, object_qnode_key, qg,
                                                                  self.response)
        # FIXME (EWD, carried over): `relation` is set after construction rather than passed to QEdge(...).
        q_edge = QEdge(predicates=["biolink:has_normalized_google_distance_with"], subject=subject_qnode_key,
                       object=object_qnode_key, option_group_id=option_group_id)
        q_edge.relation = relation
        # NOTE(review): the qedge is always stored under `relation`, so when this helper runs for
        # several qnode pairs a later pair replaces the earlier qedge - confirm this is intended.
        self.message.query_graph.edges[relation] = q_edge