def _create_icees_virtual_edge(self, subject_curie, object_curie, p_value):
    """Build the virtual ICEES edge that links two curies.

    The edge carries the ICEES p-value attribute (built by
    ``self._create_icees_edge_attribute``) plus two fixed provenance
    attributes, and is tagged with ``self.virtual_relation_label`` both as
    its relation and as its only qedge key.

    :param subject_curie: curie used as the edge subject
    :param object_curie: curie used as the edge object
    :param p_value: value forwarded to ``self._create_icees_edge_attribute``
    :return: ``(edge_key, edge)`` where the key is ``ICEES:<subject>--<object>``
    """
    edge_key = f"ICEES:{subject_curie}--{object_curie}"

    attributes = [self._create_icees_edge_attribute(p_value)]
    attributes.append(
        EdgeAttribute(name="is_defined_by", value="ARAX", type="ARAX_TYPE_PLACEHOLDER"))
    attributes.append(
        EdgeAttribute(name="provided_by", value="ICEES+", type="biolink:provided_by"))

    virtual_edge = Edge(
        predicate=self.icees_edge_type,
        subject=subject_curie,
        object=object_curie,
        relation=self.virtual_relation_label,
        attributes=attributes,
    )
    virtual_edge.qedge_keys = [self.virtual_relation_label]
    return edge_key, virtual_edge
def add_edge(self, source_id, target_id, type, batch_id, connections=None):
    """Return the edge for ``(source_id, target_id)``, creating it on first use.

    When the pair is already present in ``self.edges`` the stored edge is
    returned untouched.  Otherwise a new TRAPI v1.0.0 edge is created, given
    the key ``e<current edge count>-<batch_id>``, and registered both in
    ``self.knowledge_graph.edges`` (by that key) and in ``self.edges``
    (by the node pair).

    :param source_id: subject node id
    :param target_id: object node id
    :param type: edge type, passed through ``translate_type(type, False)``
    :param batch_id: string suffix for the generated edge key
    :param connections: optional mapping; its ``'relation'`` entry (if any)
        becomes the edge relation, and it is handed to ``self.pull_attributes``
    :return: the new or previously stored edge
    """
    node_pair = (source_id, target_id)

    # Already known: hand back the existing edge without creating anything.
    if node_pair in self.edges:
        return self.edges[node_pair]

    # The key doubles as the edge's identifier inside the knowledge graph.
    edge_key = 'e' + str(len(self.knowledge_graph.edges)) + '-' + batch_id
    edge_attributes = self.pull_attributes(connections)

    if connections is not None and 'relation' in connections:
        edge_relation = connections.get('relation')
    else:
        edge_relation = None

    new_edge = Edge(
        predicate=translate_type(type, False),
        subject=source_id,
        object=target_id,
        attributes=edge_attributes,
        relation=edge_relation,
    )
    new_edge.id = edge_key  # added for trapi v1.0.0

    self.knowledge_graph.edges[edge_key] = new_edge
    self.edges[node_pair] = new_edge
    return new_edge
def _create_swagger_edge_from_kp_edge(self, kp_edge_key: str, kp_edge: Dict[str, any]) -> "Tuple[str, Edge]":
    """Wrap an edge returned by a knowledge provider in an ``Edge`` object.

    The new edge copies ``subject``, ``object`` and ``predicate`` from
    ``kp_edge`` and receives two provenance attributes: ``provided_by``
    (``self.kp_name``) and ``is_defined_by`` (``"ARAX"``).

    :param kp_edge_key: key of the edge; returned unchanged
    :param kp_edge: mapping with ``'subject'``, ``'object'`` and ``'predicate'`` entries
    :return: ``(kp_edge_key, edge)``
    :raises KeyError: if ``kp_edge`` lacks one of the three required entries
    """
    # NOTE: the return annotation is quoted so that correcting it (the
    # function has always returned a tuple, not a bare Edge) cannot introduce
    # a new import-time name dependency.
    swagger_edge = Edge(subject=kp_edge['subject'],
                        object=kp_edge['object'],
                        predicate=kp_edge['predicate'])
    swagger_edge.attributes = [
        Attribute(name="provided_by", value=self.kp_name,
                  type=eu.get_attribute_type("provided_by")),
        Attribute(name="is_defined_by", value="ARAX",
                  type=eu.get_attribute_type("is_defined_by")),
    ]
    return kp_edge_key, swagger_edge
def resultGraph(graph):
    """Convert ``graph`` into a ``KnowledgeGraph`` of ``Node`` / ``Edge`` objects.

    ``graph`` must expose ``nodes(data=True)`` and ``edges(data=True)``
    iterables of ``(id, data)`` and ``(source, target, data)`` records.

    * Node data must contain ``'labels'`` and ``'name'``; every other entry
      becomes a stringified ``NodeAttribute``.
    * Edge data must contain ``'type'`` and ``'id'``; entries other than
      ``'type'``, ``'source'`` and ``'id'`` become stringified
      ``EdgeAttribute`` objects.  A missing ``'source'`` is filled in with
      ``"NA"`` directly on the edge's data mapping.

    :param graph: the graph to convert
    :return: the populated ``KnowledgeGraph``
    """
    kg_nodes = []
    for record in graph.nodes(data=True):
        node_id, properties = record[0], record[1]
        extras = []
        for key, value in properties.items():
            if key in ('labels', 'name'):
                continue
            extras.append(NodeAttribute(name=key, value=str(value)))
        kg_nodes.append(
            Node(id=str(node_id),
                 type=[str(label) for label in properties['labels']],
                 name=str(properties['name']),
                 node_attributes=extras))

    kg_edges = []
    for record in graph.edges(data=True):
        start, end, properties = record[0], record[1], record[2]
        extras = []
        for key, value in properties.items():
            if key in ('type', 'source', 'id'):
                continue
            extras.append(EdgeAttribute(name=key, value=str(value)))
        # Edges without provenance get the "NA" placeholder (stored on the graph data).
        properties.setdefault('source', "NA")
        kg_edges.append(
            Edge(type=properties['type'],
                 source_id=str(start),
                 target_id=str(end),
                 provided_by=str(properties['source']),
                 id=str(properties['id']),
                 edge_attributes=extras))

    return KnowledgeGraph(nodes=kg_nodes, edges=kg_edges)
def _create_icees_virtual_edge(self, subject_curie, object_curie, p_value):
    """Build the virtual ICEES edge that links two curies.

    The edge carries, in order: the ICEES p-value attribute, the virtual
    relation label, an aggregator-knowledge-source attribute naming
    ``infores:icees``, and a computed-value marker attributed to
    ``infores:arax-reasoner-ara``.  ``self.virtual_relation_label`` is set as
    the edge's only qedge key.

    :param subject_curie: curie used as the edge subject
    :param object_curie: curie used as the edge object
    :param p_value: value forwarded to ``self._create_icees_edge_attribute``
    :return: ``(edge_key, edge)`` where the key is ``ICEES:<subject>--<object>``
    """
    edge_key = f"ICEES:{subject_curie}--{object_curie}"
    icees_source = "infores:icees"

    p_value_attribute = self._create_icees_edge_attribute(p_value)
    label_attribute = EdgeAttribute(
        original_attribute_name="virtual_relation_label",
        value=self.virtual_relation_label,
        attribute_type_id="biolink:Unknown")
    source_attribute = EdgeAttribute(
        original_attribute_name="provided_by",
        value=icees_source,
        attribute_type_id="biolink:aggregator_knowledge_source",
        attribute_source=icees_source,
        value_type_id="biolink:InformationResource")
    computed_attribute = EdgeAttribute(
        original_attribute_name=None,
        value=True,
        attribute_type_id="biolink:computed_value",
        attribute_source="infores:arax-reasoner-ara",
        value_type_id="metatype:Boolean",
        value_url=None,
        description="This edge is a container for a computed value between two nodes that is not directly attachable to other edges.")

    virtual_edge = Edge(
        predicate=self.icees_edge_type,
        subject=subject_curie,
        object=object_curie,
        attributes=[p_value_attribute, label_attribute, source_attribute, computed_attribute],
    )
    virtual_edge.qedge_keys = [self.virtual_relation_label]
    return edge_key, virtual_edge
def _create_ngd_edge(self, ngd_value: float, subject: str, object: str, pmid_list: list) -> Tuple[str, Edge]:
    """Create the edge that carries an NGD value between two nodes.

    The edge key is ``NGD:<subject>--<predicate>--<object>``.  The edge's
    attributes are, in order: the NGD value itself, ``provided_by`` and
    ``is_defined_by`` (both ``"ARAX"``), and ``publications`` holding
    ``pmid_list``.

    :param ngd_value: value stored in the NGD attribute
    :param subject: key of the subject node
    :param object: key of the object node
    :param pmid_list: value stored in the ``publications`` attribute
    :return: ``(edge_key, edge)``
    """
    edge = Edge()
    edge.predicate = self.ngd_edge_type
    edge.subject = subject
    edge.object = object
    edge_key = f"NGD:{subject}--{edge.predicate}--{object}"

    value_attribute = Attribute(name=self.ngd_edge_attribute_name,
                                type=self.ngd_edge_attribute_type,
                                value=ngd_value,
                                url=self.ngd_edge_attribute_url)
    edge.attributes = [value_attribute]

    provenance = [
        Attribute(name="provided_by", value="ARAX",
                  type=eu.get_attribute_type("provided_by")),
        Attribute(name="is_defined_by", value="ARAX",
                  type=eu.get_attribute_type("is_defined_by")),
        Attribute(name="publications", value=pmid_list,
                  type=eu.get_attribute_type("publications")),
    ]
    edge.attributes += provenance
    return edge_key, edge
def _create_swagger_edge_from_kp_edge(self, kp_edge: Dict[str, any]) -> Tuple[str, Edge]:
    """Wrap an edge returned by a knowledge provider in an ``Edge`` object.

    ``source_id`` / ``target_id`` / ``type`` become subject / object /
    predicate.  The edge always gets ``provided_by`` (``self.kp_name``) and
    ``is_defined_by`` (``"ARAX"``) attributes; a score attribute named after
    ``kp_edge['score_name']`` is appended only when ``kp_edge`` holds a
    truthy ``'score'``.

    :param kp_edge: mapping with ``'source_id'``, ``'target_id'``, ``'type'``,
        ``'score_name'`` and ``'id'`` entries, and optionally ``'score'``
    :return: ``(kp_edge['id'], edge)``
    """
    edge = Edge(subject=kp_edge['source_id'],
                object=kp_edge['target_id'],
                predicate=kp_edge['type'])
    edge.attributes = [
        Attribute(name="provided_by", value=self.kp_name,
                  type=eu.get_attribute_type("provided_by")),
        Attribute(name="is_defined_by", value="ARAX",
                  type=eu.get_attribute_type("is_defined_by")),
    ]

    score_name = kp_edge['score_name']
    score_value = kp_edge.get('score')
    # Some returned edges are missing a score value for whatever reason
    if not score_value:
        return kp_edge['id'], edge

    score_type = self.score_type_lookup.get(score_name, "biolink:Unknown")
    edge.attributes.append(Attribute(name=score_name, type=score_type, value=score_value))
    return kp_edge['id'], edge
def _convert_kg2c_plover_edge_to_trapi_edge(self, edge_tuple: list) -> Edge:
    """Turn a positional edge record into an ``Edge``.

    ``edge_tuple`` is read positionally: index 0 is the subject, 1 the
    object, 2 the predicate and 3 an iterable of knowledge-source infores
    curies.  The resulting edge first gets an aggregator-knowledge-source
    attribute for ``self.kg2_infores_curie``, followed by one
    knowledge-source attribute per entry of index 3.

    :param edge_tuple: the positional edge record
    :return: the populated edge
    """
    trapi_edge = Edge(subject=edge_tuple[0],
                      object=edge_tuple[1],
                      predicate=edge_tuple[2],
                      attributes=[])
    source_curies = edge_tuple[3]

    # Indicate that this edge came from the KG2 KP
    trapi_edge.attributes.append(
        Attribute(attribute_type_id="biolink:aggregator_knowledge_source",
                  value=self.kg2_infores_curie,
                  value_type_id="biolink:InformationResource",
                  attribute_source=self.kg2_infores_curie))

    # One knowledge-source attribute per source listed on the edge
    per_source_attributes = []
    for source_curie in source_curies:
        per_source_attributes.append(
            Attribute(attribute_type_id="biolink:knowledge_source",
                      value=source_curie,
                      value_type_id="biolink:InformationResource",
                      attribute_source=self.kg2_infores_curie))
    trapi_edge.attributes += per_source_attributes
    return trapi_edge
def _add_answers_to_kg(self, answer_kg: QGOrganizedKnowledgeGraph, reasoner_std_response: Dict[str, any],
                       input_qnode_key: str, output_qnode_key: str, qedge_key: str,
                       log: ARAXResponse) -> QGOrganizedKnowledgeGraph:
    """Copy the nodes and edges of a BTE response into ``answer_kg``.

    Nothing is added when the response's knowledge graph has no edges.
    BTE's own query-graph ids are translated back: ``"n0"`` maps to
    ``input_qnode_key``, ``"n1"`` to ``output_qnode_key`` and ``"e1"`` to
    ``qedge_key``.  Any other id is logged as an ``UnknownQGID`` error and
    ``answer_kg`` is returned as filled so far.  Nodes bound to the output
    qnode are re-keyed with the curie chosen by
    ``self._get_best_equivalent_bte_curie``; edges are rewired to those keys.

    :param answer_kg: knowledge graph to fill (also the return value)
    :param reasoner_std_response: BTE response with ``'results'`` and ``'knowledge_graph'``
    :param input_qnode_key: qnode key standing for BTE's ``n0``
    :param output_qnode_key: qnode key standing for BTE's ``n1``
    :param qedge_key: qedge key standing for BTE's ``e1``
    :param log: response object used for debug/error messages
    :return: ``answer_kg``
    """
    kg_to_qg_ids_dict = self._build_kg_to_qg_id_dict(reasoner_std_response['results'])
    bte_kg = reasoner_std_response['knowledge_graph']
    bte_edges = bte_kg['edges']
    if not bte_edges:
        return answer_kg

    log.debug(f"Got results back from BTE for this query ({len(bte_edges)} edges)")
    remapped_node_keys = {}

    for bte_node in bte_kg['nodes']:
        swagger_node = Node()
        bte_node_key = bte_node.get('id')
        swagger_node.name = bte_node.get('name')
        snake_case_type = eu.convert_string_to_snake_case(bte_node.get('type'))
        swagger_node.category = eu.convert_to_list(snake_case_type)

        # Map the returned BTE qg_ids back to the original qnode_keys in our query graph
        bte_qg_id = kg_to_qg_ids_dict['nodes'].get(bte_node_key)
        if bte_qg_id == "n0":
            qnode_key = input_qnode_key
        elif bte_qg_id == "n1":
            qnode_key = output_qnode_key
        else:
            log.error("Could not map BTE qg_id to ARAX qnode_key", error_code="UnknownQGID")
            return answer_kg

        # Output nodes are stored under their preferred equivalent identifier
        swagger_node_key = bte_node_key
        if qnode_key == output_qnode_key:
            if bte_node_key not in remapped_node_keys:
                equivalent_curies = []
                for prefix, local_ids in bte_node.get('equivalent_identifiers').items():
                    for local_id in local_ids:
                        equivalent_curies.append(f"{prefix}:{eu.get_curie_local_id(local_id)}")
                remapped_node_keys[bte_node_key] = self._get_best_equivalent_bte_curie(
                    equivalent_curies, swagger_node.category[0])
            swagger_node_key = remapped_node_keys.get(bte_node_key)

        answer_kg.add_node(swagger_node_key, swagger_node, qnode_key)

    for bte_edge in bte_edges:
        swagger_edge = Edge()
        swagger_edge_key = bte_edge.get("id")
        swagger_edge.predicate = bte_edge.get('type')
        # Endpoints follow any re-keying done for the nodes above
        swagger_edge.subject = remapped_node_keys.get(bte_edge.get('source_id'), bte_edge.get('source_id'))
        swagger_edge.object = remapped_node_keys.get(bte_edge.get('target_id'), bte_edge.get('target_id'))
        swagger_edge.attributes = [
            Attribute(name="provided_by", value=bte_edge.get('edge_source'),
                      type=eu.get_attribute_type("provided_by")),
            Attribute(name="is_defined_by", value="BTE",
                      type=eu.get_attribute_type("is_defined_by")),
        ]

        # Map the returned BTE qg_id back to the original qedge_key in our query graph
        bte_qg_id = kg_to_qg_ids_dict['edges'].get(swagger_edge_key)
        if bte_qg_id != "e1":
            log.error("Could not map BTE qg_id to ARAX qedge_key", error_code="UnknownQGID")
            return answer_kg

        answer_kg.add_edge(swagger_edge_key, swagger_edge, qedge_key)

    return answer_kg
def _convert_kg2c_edge_to_swagger_edge(self, neo4j_edge: Dict[str, any]) -> Tuple[str, Edge]:
    """Convert a KG2c neo4j edge record into an ``Edge``.

    The predicate comes from ``simplified_edge_label``; ``provided_by`` and
    ``publications`` are turned into attributes by
    ``self._create_swagger_attributes``, and an ``is_defined_by`` attribute
    with value ``"ARAX/KG2c"`` is appended.

    :param neo4j_edge: the edge record
    :return: ``(edge_key, edge)`` where the key is ``KG2c:<id>``
    """
    edge = Edge()
    edge_key = f"KG2c:{neo4j_edge.get('id')}"
    edge.predicate = neo4j_edge.get("simplified_edge_label")
    edge.subject = neo4j_edge.get("subject")
    edge.object = neo4j_edge.get("object")

    attribute_properties = ["provided_by", "publications"]
    edge.attributes = self._create_swagger_attributes(attribute_properties, neo4j_edge)
    edge.attributes.append(
        Attribute(name="is_defined_by",
                  value="ARAX/KG2c",
                  type=eu.get_attribute_type("is_defined_by")))
    return edge_key, edge
def _convert_kg1_edge_to_swagger_edge(self, neo4j_edge: Dict[str, any],
                                      node_uuid_to_curie_dict: Dict[str, str]) -> Tuple[str, Edge]:
    """Convert a KG1 neo4j edge record into an ``Edge``.

    The record refers to its endpoints by node UUID; both are translated to
    curies through ``node_uuid_to_curie_dict``.  ``provided_by`` and
    ``probability`` are turned into attributes by
    ``self._create_swagger_attributes``, and an ``is_defined_by`` attribute
    with value ``"ARAX/KG1"`` is appended.

    :param neo4j_edge: the edge record
    :param node_uuid_to_curie_dict: lookup from node UUID to curie
    :return: ``(edge_key, edge)`` where the key is ``KG1:<id>``
    :raises KeyError: if an endpoint UUID is absent from the lookup
    """
    edge = Edge()
    edge_key = f"KG1:{neo4j_edge.get('id')}"
    edge.predicate = neo4j_edge.get("predicate")

    source_uuid = neo4j_edge.get("source_node_uuid")
    edge.subject = node_uuid_to_curie_dict[source_uuid]
    target_uuid = neo4j_edge.get("target_node_uuid")
    edge.object = node_uuid_to_curie_dict[target_uuid]
    edge.relation = neo4j_edge.get("relation")

    attribute_properties = ["provided_by", "probability"]
    edge.attributes = self._create_swagger_attributes(attribute_properties, neo4j_edge)
    edge.attributes.append(
        Attribute(name="is_defined_by",
                  value="ARAX/KG1",
                  type=eu.get_attribute_type("is_defined_by")))
    return edge_key, edge
def _convert_kg2_edge_to_swagger_edge(self, neo4j_edge: Dict[str, any]) -> "Tuple[str, Edge]":
    """Convert a KG2 neo4j edge record into an ``Edge``.

    The predicate comes from ``simplified_edge_label``.  The properties
    listed in ``other_properties`` are turned into attributes by
    ``self._create_swagger_attributes``, and an ``is_defined_by`` attribute
    with value ``"ARAX/KG2"`` is appended.

    :param neo4j_edge: the edge record
    :return: ``(edge_key, edge)`` where the key is ``KG2:<id>``
    """
    # NOTE: the return annotation is quoted so that correcting it (the
    # function has always returned a tuple, not a bare Edge) cannot introduce
    # a new import-time name dependency.
    swagger_edge = Edge()
    swagger_edge_key = f"KG2:{neo4j_edge.get('id')}"
    swagger_edge.predicate = neo4j_edge.get("simplified_edge_label")
    swagger_edge.subject = neo4j_edge.get("subject")
    swagger_edge.object = neo4j_edge.get("object")
    swagger_edge.relation = neo4j_edge.get("relation")

    # Add additional properties on KG2 edges as swagger Attribute objects
    other_properties = [
        "provided_by",
        "publications",
        "negated",
        "relation_curie",
        "simplified_relation_curie",
        "simplified_relation",
        "edge_label",
    ]
    swagger_edge.attributes = self._create_swagger_attributes(other_properties, neo4j_edge)
    is_defined_by_attribute = Attribute(name="is_defined_by",
                                        value="ARAX/KG2",
                                        type=eu.get_attribute_type("is_defined_by"))
    swagger_edge.attributes.append(is_defined_by_attribute)
    return swagger_edge_key, swagger_edge
def _convert_to_swagger_edge(self, subject: str, object: str, name: str, value: float) -> Tuple[str, Edge]:
    """Create a CHP edge carrying one named value between two nodes.

    The predicate is ``biolink:<name>`` and the relation is ``None``.  The
    value is stored, stringified, in an attribute called ``name``; it is
    followed by ``provided_by`` (``self.kp_name``) and ``is_defined_by``
    (``"ARAX"``) attributes.

    :param subject: key of the subject node
    :param object: key of the object node
    :param name: name of the value; also used for the predicate and edge key
    :param value: the value to store
    :return: ``(edge_key, edge)`` where the key is ``CHP:<subject>-<name>-<object>``
    """
    edge = Edge()
    edge.predicate = f"biolink:{name}"
    edge.subject = subject
    edge.object = object
    edge_key = f"CHP:{subject}-{name}-{object}"
    edge.relation = None

    value_attribute = Attribute(type="EDAM:data_0951",
                                name=name,
                                value=str(value),
                                url="https://github.com/di2ag/chp_client")
    provided_by_attribute = Attribute(name="provided_by",
                                      value=self.kp_name,
                                      type=eu.get_attribute_type("provided_by"))
    is_defined_by_attribute = Attribute(name="is_defined_by",
                                        value="ARAX",
                                        type=eu.get_attribute_type("is_defined_by"))
    edge.attributes = [value_attribute, provided_by_attribute, is_defined_by_attribute]
    return edge_key, edge
def _create_ngd_edge(self, ngd_value: float, subject: str, object: str, pmid_list: list) -> Tuple[str, Edge]:
    """Create the edge that carries an NGD value between two nodes.

    The edge key is ``NGD:<subject>--<predicate>--<object>``.  The edge's
    attributes are, in order: the NGD value, a ``publications`` attribute
    built by ``self.decorator`` from ``pmid_list``, and the KP-source,
    ARAX-source and computed-value attributes produced by the ``eu`` helpers.

    :param ngd_value: value stored in the NGD attribute
    :param subject: key of the subject node
    :param object: key of the object node
    :param pmid_list: value handed to the ``publications`` attribute
    :return: ``(edge_key, edge)``
    """
    edge = Edge()
    edge.predicate = self.ngd_edge_predicate
    edge.subject = subject
    edge.object = object
    edge_key = f"NGD:{subject}--{edge.predicate}--{object}"

    value_attribute = Attribute(original_attribute_name=self.ngd_edge_attribute_name,
                                attribute_type_id=self.ngd_edge_attribute_type,
                                value=ngd_value)
    edge.attributes = [value_attribute]

    kp_description = "ARAX's in-house normalized google distance database."
    publications_attribute = self.decorator.create_attribute("publications", pmid_list)
    kp_source_attribute = eu.get_kp_source_attribute("infores:arax-normalized-google-distance",
                                                     arax_kp=True,
                                                     description=kp_description)
    arax_source_attribute = eu.get_arax_source_attribute()
    computed_value_attribute = eu.get_computed_value_attribute()
    edge.attributes += [
        publications_attribute,
        kp_source_attribute,
        arax_source_attribute,
        computed_value_attribute,
    ]
    return edge_key, edge
def predict_drug_treats_disease(self):
    """
    Decorate the knowledge graph with drug-treats-disease probabilities.

    The mode is selected by ``self.parameters``:

    * ``'virtual_relation_label'`` present: every (drug-like node bound to
      ``subject_qnode_key``, disease-like node bound to ``object_qnode_key``)
      pair with a non-zero probability gets a ``biolink:probably_treats``
      virtual edge carrying that probability.  If at least one edge was
      added, a matching edge is added to the query graph under the virtual
      relation label.
    * otherwise: every existing edge joining a drug-like node and a
      disease-like node (in either direction) gets a ``probability_treats``
      attribute when the probability is non-zero.  Any exception raised in
      this mode is logged on the response instead of propagating.

    A probability that is missing or not finite is treated as 0, i.e. the
    pair is left undecorated.

    :return: response
    """
    parameters = self.parameters
    self.response.debug(f"Computing drug disease treatment probability based on a machine learning model")
    self.response.info(f"Computing drug disease treatment probability based on a machine learning model: See [this publication](https://doi.org/10.1101/765305) for more details about how this is accomplished.")

    attribute_name = "probability_treats"
    attribute_type = "EDAM:data_0951"
    url = "https://doi.org/10.1101/765305"
    drug_labels = ("drug", "chemical_substance", "biolink:Drug", "biolink:ChemicalSubstance")
    disease_labels = ("disease", "phenotypic_feature", "biolink:Disease", "biolink:PhenotypicFeature")

    def has_label(categories, labels):
        """Return True if any of ``labels`` is ``in`` ``categories``."""
        return any(label in categories for label in labels)

    def convert_pair(source_curie, source_labels, target_curie, target_labels):
        """Return (converted source, converted target), or None if either curie is unusable."""
        converted = []
        # The source is converted and checked before the target is even looked at.
        for curie, allowed in ((source_curie, source_labels), (target_curie, target_labels)):
            info = self.convert_to_trained_curies(curie)
            if info is None or info['preferred_type'] not in allowed:
                return None
            converted.append(info['preferred_curie'])
        return tuple(converted)

    def lookup_probability(drug_curie, disease_curie):
        """Return the finite probability for the pair, or 0 when there is none."""
        if self.use_prob_db is True:
            probability = self.pred.get_prob_from_DTD_db(drug_curie, disease_curie)
        else:
            probability = self.pred.prob_single(drug_curie, disease_curie)
            if probability is not None:
                probability = probability[0]
        if probability is not None and np.isfinite(probability):
            return probability
        return 0  # default: a zero value means "do not decorate"

    def probability_attribute(value):
        """Build the ``probability_treats`` attribute for ``value``."""
        return EdgeAttribute(type=attribute_type, name=attribute_name, value=str(value), url=url)

    # if you want to add virtual edges, identify the source/targets, decorate the edges, add them to the KG,
    # and then add one to the QG corresponding to them
    if 'virtual_relation_label' in parameters:
        relation = parameters['virtual_relation_label']
        edge_type = "biolink:probably_treats"
        source_curies_to_decorate = set()
        target_curies_to_decorate = set()
        curie_to_name = dict()

        # identify the nodes that we should be adding virtual edges for
        for node_key, node in self.message.knowledge_graph.nodes.items():
            if not hasattr(node, 'qnode_keys'):
                continue
            if parameters['subject_qnode_key'] in node.qnode_keys:
                if has_label(node.category, drug_labels):  # this is now NOT checked by ARAX_overlay
                    source_curies_to_decorate.add(node_key)
                    curie_to_name[node_key] = node.name
            if parameters['object_qnode_key'] in node.qnode_keys:
                if has_label(node.category, disease_labels):  # this is now NOT checked by ARAX_overlay
                    target_curies_to_decorate.add(node_key)
                    curie_to_name[node_key] = node.name

        added_flag = False  # check to see if any edges were added
        # iterate over all pairs of these nodes, add the virtual edge, decorate with the correct attribute
        for source_curie, target_curie in itertools.product(source_curies_to_decorate, target_curies_to_decorate):
            self.response.debug(f"Predicting probability that {curie_to_name[source_curie]} treats {curie_to_name[target_curie]}")
            converted = convert_pair(source_curie, drug_labels, target_curie, disease_labels)
            if converted is None:
                continue
            value = lookup_probability(*converted)
            if value == 0:
                continue

            added_flag = True
            defined_datetime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            edge_key = f"{relation}_{self.global_iter}"
            self.global_iter += 1
            edge_attribute_list = [
                probability_attribute(value),
                EdgeAttribute(name="is_defined_by", value="ARAX", type="ARAX_TYPE_PLACEHOLDER"),
                EdgeAttribute(name="defined_datetime", value=defined_datetime, type="metatype:Datetime"),
                EdgeAttribute(name="provided_by", value="ARAX", type="biolink:provided_by"),
            ]
            edge = Edge(predicate=edge_type, subject=source_curie, object=target_curie,
                        relation=relation, attributes=edge_attribute_list)
            edge.qedge_keys = [relation]
            self.message.knowledge_graph.edges[edge_key] = edge

        # Now add a q_edge the query_graph since I've added an extra edge to the KG
        if added_flag:
            subject_qnode_key = parameters['subject_qnode_key']
            object_qnode_key = parameters['object_qnode_key']
            option_group_id = ou.determine_virtual_qedge_option_group(subject_qnode_key, object_qnode_key,
                                                                      self.message.query_graph, self.response)
            q_edge = QEdge(predicate=edge_type, relation=relation, subject=subject_qnode_key,
                           object=object_qnode_key, option_group_id=option_group_id)
            self.message.query_graph.edges[relation] = q_edge
        return self.response

    # otherwise: decorate each suitable edge that is already in the KG
    try:
        # map curies to types and names
        curie_to_type = dict()
        curie_to_name = dict()
        for node_key, node in self.message.knowledge_graph.nodes.items():
            curie_to_type[node_key] = node.category
            curie_to_name[node_key] = node.name

        for edge in self.message.knowledge_graph.edges.values():
            # Make sure the edge attributes are a list we can append to
            if not edge.attributes:
                edge.attributes = []

            source_curie = edge.subject
            target_curie = edge.object
            source_types = curie_to_type[source_curie]
            target_types = curie_to_type[target_curie]

            if has_label(source_types, drug_labels) and has_label(target_types, disease_labels):
                drug_is_source = True
                source_labels, target_labels = drug_labels, disease_labels
            elif has_label(target_types, drug_labels) and has_label(source_types, disease_labels):
                drug_is_source = False
                source_labels, target_labels = disease_labels, drug_labels
            else:
                continue

            self.response.debug(f"Predicting treatment probability between {curie_to_name[source_curie]} and {curie_to_name[target_curie]}")
            converted = convert_pair(source_curie, source_labels, target_curie, target_labels)
            if converted is None:
                continue
            # The model is always queried as (drug, disease), whatever the edge direction.
            if drug_is_source:
                drug_curie, disease_curie = converted
            else:
                disease_curie, drug_curie = converted

            value = lookup_probability(drug_curie, disease_curie)
            if value != 0:
                edge.attributes.append(probability_attribute(value))
    except Exception as err:
        # Best effort: report the failure on the response rather than crash the caller.
        self.response.error(traceback.format_exc(), error_code=type(err).__name__)
        self.response.error(f"Something went wrong adding the drug disease treatment probability")
    else:
        self.response.info(f"Drug disease treatment probability successfully added to edges")

    return self.response
def fisher_exact_test(self):
    """
    Perform Fisher's exact test between two query nodes and decorate the knowledge graph.

    Reads from ``self.parameters``:
      * ``subject_qnode_key`` (required): qnode key of the query sample.
      * ``object_qnode_key`` (required): qnode key of the nodes that receive a p-value.
      * ``virtual_relation_label`` (required): key of the virtual query edge and the
        prefix of the virtual knowledge-graph edge keys.
      * ``rel_edge_key`` (optional): restrict the test to KG edges bound to this qedge.
      * ``top_n`` (optional): keep only the ``top_n`` smallest p-values.
      * ``cutoff`` (optional): keep only p-values strictly below ``cutoff``.

    For every retained (subject, object) pair a virtual edge with predicate
    ``biolink:has_fisher_exact_test_p_value_with`` is added to the message KG, and one
    matching virtual QEdge is added to the query graph if at least one KG edge was added.

    :return: response (``self.response``); problems are reported through it, not raised.
    """
    self.response.info(f"Performing Fisher's Exact Test to add p-value to edge attribute of virtual edge")

    def _report_failure(summary):
        # Must be called from inside an ``except`` block: logs the active traceback,
        # then the human-readable summary, and hands back the response to return.
        tb = traceback.format_exc()
        error_type = sys.exc_info()[0]
        self.response.error(tb, error_code=error_type.__name__)
        self.response.error(summary)
        return self.response

    # check the input parameters (same order as the reported errors: subject, label, object)
    for required_name in ('subject_qnode_key', 'virtual_relation_label', 'object_qnode_key'):
        if required_name not in self.parameters:
            self.response.error(f"The argument '{required_name}' is required for fisher_exact_test function")
            return self.response
    subject_qnode_key = self.parameters['subject_qnode_key']
    virtual_relation_label = str(self.parameters['virtual_relation_label'])
    object_qnode_key = self.parameters['object_qnode_key']
    rel_edge_key = self.parameters['rel_edge_key'] if 'rel_edge_key' in self.parameters else None
    top_n = int(self.parameters['top_n']) if 'top_n' in self.parameters else None
    cutoff = float(self.parameters['cutoff']) if 'cutoff' in self.parameters else None

    ## locate the local copy of the KG2c sqlite file under <...>/RTX/code/ARAX/KnowledgeSources/KG2c;
    ## if it is missing, copy it from the host configured in RTXConfig
    pathlist = os.path.realpath(__file__).split(os.path.sep)
    RTXindex = pathlist.index("RTX")
    filepath = os.path.sep.join([*pathlist[:(RTXindex + 1)], 'code', 'ARAX', 'KnowledgeSources', 'KG2c'])
    sqlite_name = RTXConfig.kg2c_sqlite_path.split("/")[-1]
    sqlite_file_path = f"{filepath}{os.path.sep}{sqlite_name}"
    if not os.path.exists(sqlite_file_path):
        # NOTE(review): the scp exit status is not checked and the command is built as a
        # shell string from config values; consider subprocess.run([...], check=True).
        os.system(f"scp {RTXConfig.kg2c_sqlite_username}@{RTXConfig.kg2c_sqlite_host}:{RTXConfig.kg2c_sqlite_path} {sqlite_file_path}")
    self.sqlite_file_path = sqlite_file_path

    if rel_edge_key is not None:
        self.response.warning(f"The 'rel_edge_key' option in FET is specified, it will cause slow for the calculation of FEST test.")

    # initialize some variables
    nodes_info = {}
    edge_expand_kp = []       # knowledge sources seen on the relevant KG edges
    subject_node_list = []    # KG nodes forming the query sample
    object_node_dict = {}     # object KG node -> set of sample nodes adjacent to it
    subject_node_exist = False
    object_node_exist = False
    query_edge_key = set()
    rel_edge_type = set()     # predicates seen on edges bound to rel_edge_key
    subject_node_category = None
    object_node_category = None

    ## Check if subject_qnode_key and object_qnode_key are in the Query Graph
    try:
        qg_nodes = self.message.query_graph.nodes
        if len(qg_nodes) == 0:
            self.response.error(f"There is no query node in QG")
            return self.response
        for node_key in qg_nodes:
            if node_key == subject_qnode_key:
                subject_node_exist = True
                subject_node_category = qg_nodes[node_key].categories
            elif node_key == object_qnode_key:
                object_node_exist = True
                object_node_category = qg_nodes[node_key].categories
    except Exception:
        return _report_failure(f"Something went wrong with retrieving nodes in message QG")

    if not subject_node_exist:
        self.response.error(f"No query node with subject qnode key {subject_qnode_key} detected in QG for Fisher's Exact Test")
        return self.response
    if not object_node_exist:
        self.response.error(f"No query node with object qnode key {object_qnode_key} detected in QG for Fisher's Exact Test")
        return self.response

    ## Check if there is a query edge connected to both subject_qnode_key and object_qnode_key in the Query Graph
    try:
        qg_edges = self.message.query_graph.edges
        if len(qg_edges) == 0:
            self.response.error(f"There is no query edge in Query Graph")
            return self.response
        for edge_key in qg_edges:
            qedge = qg_edges[edge_key]
            # a qedge carrying a 'relation' is a virtual edge; only actual query edges are kept
            if getattr(qedge, "relation", None) is not None:
                continue
            forward = qedge.subject == subject_qnode_key and qedge.object == object_qnode_key
            backward = qedge.subject == object_qnode_key and qedge.object == subject_qnode_key
            if forward or backward:
                query_edge_key.add(edge_key)
    except Exception:
        return _report_failure(f"Something went wrong with retrieving edges in message QG")

    if len(query_edge_key) == 0:
        self.response.error(
            f"No query edge connected to both subject node with qnode key {subject_qnode_key} and object node with qnode key {object_qnode_key} detected in QG for Fisher's Exact Test")
        return self.response
    if rel_edge_key and rel_edge_key not in query_edge_key:
        self.response.error(f"No query edge with qedge key {rel_edge_key} connected to both subject node with qnode key {subject_qnode_key} and object node with qnode key {object_qnode_key} detected in QG for Fisher's Exact Test")
        return self.response

    ## loop over all nodes in KG and collect their node information
    try:
        for node_key, node in self.message.knowledge_graph.nodes.items():
            nodes_info[node_key] = {'qnode_keys': node.qnode_keys, 'category': node.categories[0]}
    except Exception:
        return _report_failure(f"Something went wrong with retrieving nodes in message KG")

    def _record_pair(sample_node, target_node):
        # Register one KG edge: sample_node belongs to the query sample and is adjacent to target_node.
        subject_node_list.append(sample_node)
        object_node_dict.setdefault(target_node, set()).add(sample_node)

    ## loop over all edges in KG and create subject node list and object node dict based on
    ## subject_qnode_key, object_qnode_key as well as rel_edge_key (optional, otherwise all edges are considered)
    try:
        for edge_key, edge in self.message.knowledge_graph.edges.items():
            # edges carrying an 'is_defined_by' attribute are skipped
            if any(attr.original_attribute_name == 'is_defined_by' for attr in edge.attributes):
                continue

            ## Collect all knowledge source information for each edge between queried qnode_keys (eg. 'n01', 'n02')
            temp_kp = []
            for attr in edge.attributes:
                if attr.attribute_type_id in ('biolink:aggregator_knowledge_source', 'biolink:knowledge_source'):
                    temp_kp += self._change_kp_name(attr.value)
            if 'arax' in temp_kp:
                temp_kp.remove('arax')

            if rel_edge_key:
                if rel_edge_key in edge.qedge_keys:
                    subject_is_sample = subject_qnode_key in nodes_info[edge.subject]['qnode_keys']
                    edge_expand_kp.extend(temp_kp)
                    rel_edge_type.add(edge.predicate)
                    if subject_is_sample:
                        _record_pair(edge.subject, edge.object)
                    else:
                        # the edge points the other way round
                        _record_pair(edge.object, edge.subject)
            else:
                subject_side_keys = nodes_info[edge.subject]['qnode_keys']
                if subject_qnode_key in subject_side_keys:
                    if object_qnode_key in nodes_info[edge.object]['qnode_keys']:
                        edge_expand_kp.extend(temp_kp)
                        _record_pair(edge.subject, edge.object)
                elif object_qnode_key in subject_side_keys:
                    if subject_qnode_key in nodes_info[edge.object]['qnode_keys']:
                        edge_expand_kp.extend(temp_kp)
                        _record_pair(edge.object, edge.subject)
    except Exception:
        return _report_failure(f"Something went wrong with retrieving edges in message KG")

    subject_node_list = list(set(subject_node_list))  ## remove the duplicate subject node key

    ## check if there is no subject node in message KG
    if len(subject_node_list) == 0:
        self.response.error(f"No subject node found in message KG for Fisher's Exact Test")
        return self.response

    ## check if there is no object node in message KG
    if len(object_node_dict) == 0:
        self.response.error(f"No object node found in message KG for Fisher's Exact Test")
        return self.response

    ## the categories of both query nodes are needed below; refuse to continue without them
    if subject_node_category is None:
        self.response.error(f"Subject node with qnode key {subject_qnode_key} was set to None in Query Graph. Please specify the node type")
        return self.response
    if object_node_category is None:
        self.response.error(f"Object node with qnode key {object_qnode_key} was set to None in Query Graph. Please specify the node type")
        return self.response

    ## check how many kps were used in message KG. If more than one, the one with the max number of
    ## edges connected to both subject nodes and object nodes is reported
    kp_counts = collections.Counter(edge_expand_kp)
    if not kp_counts:
        # No source attribute on any relevant edge: fall back instead of crashing on max([]).
        detected_kp = None
        self.response.warning(f"No knowledge source information was found on the edges between the subject node with qnode key {subject_qnode_key} and object node with qnode key {object_qnode_key}. The infores:rtx-kg2 is used to calculate Fisher's exact test.")
    elif len(kp_counts) == 1:
        detected_kp = edge_expand_kp[0]
    else:
        # ties on the count are broken in favour of the provider that was first seen last
        _, best_index = max((count, position) for position, count in enumerate(kp_counts.values()))
        detected_kp = list(kp_counts.keys())[best_index]
        self.response.debug(f"{kp_counts}")
        self.response.warning(f"More than one knowledge provider were detected to be used for expanding the edges connected to both subject node with qnode key {subject_qnode_key} and object node with qnode key {object_qnode_key}")
        self.response.warning(f"The knowledge provider {detected_kp} was used to calculate Fisher's exact test because it has the maximum number of edges connected to both subject node with qnode key {subject_qnode_key} and object node with qnode key {object_qnode_key}")

    ## map the detected provider onto "ARAX/KG1" or "infores:rtx-kg2"; anything else falls back to KG2
    if detected_kp == "rtx_kg1_kp":
        kp = 'ARAX/KG1'
    else:
        kp = 'infores:rtx-kg2'
        if detected_kp is not None and detected_kp != "rtx-kg2":
            # report the provider that was actually detected, not the fallback value
            self.response.warning(f"There is more than one knowledge source for the edges between the subject node with qnode key {subject_qnode_key} and object node with qnode key {object_qnode_key} and most of them are from {detected_kp}. The infores:rtx-kg2 is still used to calculate Fisher's exact test.")

    if kp == 'ARAX/KG1':
        ## This warning can be removed once KG1 is deprecated
        self.response.warning(f"Since KG1 will be deprecated soon and the total count of nodes is based on kg2c, currently querying with 'expand(kp=ARAX/KG1)' might cause little discrepancy for FET probability.")

    ## Print out some information used to calculate FET
    if len(subject_node_list) == 1:
        self.response.debug(f"{len(subject_node_list)} subject node with qnode key {subject_qnode_key} and node type {subject_node_category[0]} was found in message KG and used to calculate Fisher's Exact Test")
    else:
        self.response.debug(f"{len(subject_node_list)} subject nodes with qnode key {subject_qnode_key} and node type {subject_node_category[0]} was found in message KG and used to calculate Fisher's Exact Test")
    if len(object_node_dict) == 1:
        self.response.debug(f"{len(object_node_dict)} object node with qnode key {object_qnode_key} and node type {object_node_category[0]} was found in message KG and used to calculate Fisher's Exact Test")
    else:
        self.response.debug(f"{len(object_node_dict)} object nodes with qnode key {object_qnode_key} and node type {object_node_category[0]} was found in message KG and used to calculate Fisher's Exact Test")

    # find, for every object node, how many nodes of the subject category are adjacent to it
    ## Note: Regardless of whether kp='ARAX/KG1' or kp='infores:rtx-kg2', the adjacent node count is based on kg2c
    if rel_edge_key and len(rel_edge_type) == 1:
        # the edges bound to rel_edge_key share a single predicate, so restrict the count to it
        only_predicate = list(rel_edge_type)[0]
        self.response.debug(f"{kp} and edge relation type {only_predicate} were used to calculate total object nodes in Fisher's Exact Test")
        result = self.query_size_of_adjacent_nodes(node_curie=list(object_node_dict.keys()), source_type=object_node_category[0], adjacent_type=subject_node_category[0], kp=kp, rel_type=only_predicate)
    else:
        if rel_edge_key:
            # zero or several predicates: ignore the predicate altogether
            if len(rel_edge_type) == 0:
                self.response.warning(f"The edges with specified qedge key {rel_edge_key} have no category, we ignore the edge predicate and use all categories to calculate Fisher's Exact Test")
            else:
                self.response.warning(f"The edges with specified qedge key {rel_edge_key} have more than one category, we ignore the edge predicate and use all categories to calculate Fisher's Exact Test")
        self.response.debug(f"infores:rtx-kg2 was used to calculate total object nodes in Fisher's Exact Test")
        result = self.query_size_of_adjacent_nodes(node_curie=list(object_node_dict.keys()), source_type=object_node_category[0], adjacent_type=subject_node_category[0], kp='infores:rtx-kg2', rel_type=None)

    if result is None:
        return self.response  ## Something wrong happened for querying the adjacent nodes

    size_of_object, removed_nodes = result
    if len(removed_nodes) == 1:
        self.response.warning(f"One object node which is {removed_nodes[0]} can't find its neighbors. This node will be ignored for FET calculation.")
    elif len(removed_nodes) > 1:
        self.response.warning(f"{len(removed_nodes)} object nodes which are {removed_nodes} can't find its neighbors. These nodes will be ignored for FET calculation.")
    for node in removed_nodes:
        del object_node_dict[node]

    if len(object_node_dict) == 0:
        return self.response

    ## Find the total count of nodes with the same category as the subject node
    ## Note: the total count is always based on kg2c
    if kp == 'ARAX/KG1' or kp == 'infores:rtx-kg2':
        size_of_total = self.size_of_given_type_in_KP(node_type=subject_node_category[0])
        self.response.debug(f"Total {size_of_total} unique concepts with node category {subject_node_category[0]} was found in KG2c based on 'nodesynonymizer.get_total_entity_count' and this number will be used for Fisher's Exact Test")
    else:
        self.response.error(f"Only KG1 or KG2 is allowable to calculate the Fisher's exact test temporally")
        return self.response

    size_of_query_sample = len(subject_node_list)

    self.response.debug(f"Computing Fisher's Exact Test P-value")
    # build the four contingency-table cells for each object node, dropping nodes with a negative cell
    parameter_list = []
    for node in list(object_node_dict):
        in_sample = len(object_node_dict[node])
        cells = (in_sample,
                 size_of_object[node] - in_sample,
                 size_of_query_sample - in_sample,
                 (size_of_total - size_of_object[node]) - (size_of_query_sample - in_sample))
        if any(cell < 0 for cell in cells):
            self.response.warning(f"Skipping node {node} to calculate FET p-value due to issue1438 (which causes negative value).")
            del object_node_dict[node]
            continue
        parameter_list.append((node, *cells))

    try:
        FETpvalue_list = list(map(self._calculate_FET_pvalue_parallel, parameter_list))
    except Exception:
        return _report_failure(f"Something went wrong with computing Fisher's Exact Test P-value")

    # a list element (instead of a (node, p-value) pair) carries error messages from the helper
    failure_messages = [msg for elem in FETpvalue_list if isinstance(elem, list) for msg in elem]
    if any(isinstance(elem, list) for elem in FETpvalue_list):
        for msg in failure_messages:
            if isinstance(msg, tuple):
                self.response.error(msg[0], error_code=msg[1])
            else:
                self.response.error(msg)
        return self.response

    # sort by p-value, then apply the optional filters
    output = dict(sorted(dict(FETpvalue_list).items(), key=lambda item: item[1]))
    if cutoff:
        output = {node: pvalue for node, pvalue in output.items() if pvalue < cutoff}
    if top_n:
        output = dict(list(output.items())[:top_n])

    # add the virtual edge with FET result to message KG
    self.response.debug(f"Adding virtual edge with FET result to message KG")

    count = 0
    has_results = self.message.results is not None and len(self.message.results) > 0
    for object_node in object_node_dict:
        if object_node not in output:
            continue
        for sample_node in object_node_dict[object_node]:
            count += 1
            edge_attribute_list = [
                EdgeAttribute(attribute_type_id="EDAM:data_1669", original_attribute_name="fisher_exact_test_p-value", value=str(output[object_node]), value_url=None),
                EdgeAttribute(original_attribute_name="virtual_relation_label", value=virtual_relation_label, attribute_type_id="biolink:Unknown"),
                EdgeAttribute(original_attribute_name="defined_datetime", value=datetime.now().strftime("%Y-%m-%d %H:%M:%S"), attribute_type_id="metatype:Datetime"),
                EdgeAttribute(original_attribute_name="provided_by", value="infores:arax", attribute_type_id="biolink:aggregator_knowledge_source", attribute_source="infores:arax", value_type_id="biolink:InformationResource"),
                EdgeAttribute(original_attribute_name=None, value=True, attribute_type_id="biolink:computed_value", attribute_source="infores:arax-reasoner-ara", value_type_id="metatype:Boolean", value_url=None, description="This edge is a container for a computed value between two nodes that is not directly attachable to other edges."),
            ]
            edge_id = f"{virtual_relation_label}_{count}"
            edge = Edge(predicate='biolink:has_fisher_exact_test_p_value_with', subject=sample_node, object=object_node, attributes=edge_attribute_list)
            edge.qedge_keys = [virtual_relation_label]
            self.message.knowledge_graph.edges[edge_id] = edge

            if has_results:
                ou.update_results_with_overlay_edge(subject_knode_key=sample_node, object_knode_key=object_node, kedge_key=edge_id, message=self.message, log=self.response)

    self.response.debug(f"{count} new virtual edges were added to message KG")

    # add the virtual edge to message QG
    if count > 0:
        self.response.debug(f"Adding virtual edge to message QG")
        option_group_id = ou.determine_virtual_qedge_option_group(subject_qnode_key, object_qnode_key, self.message.query_graph, self.response)
        q_edge = QEdge(predicates=["biolink:has_fisher_exact_test_p_value_with"], subject=subject_qnode_key, object=object_qnode_key, option_group_id=option_group_id)
        q_edge.relation = virtual_relation_label
        self.message.query_graph.edges[virtual_relation_label] = q_edge
        self.response.debug(f"One virtual edge was added to message QG")

    return self.response
def compute_jaccard(self):
    """
    Add a Jaccard-style index between the start node and every end node as virtual edges.

    Reads from ``self.parameters``: ``start_node_key``, ``intermediate_node_key``,
    ``end_node_key`` (qnode keys) and ``virtual_relation_label`` (key of the virtual
    query edge, also stored on every virtual KG edge).

    For each KG node bound to ``end_node_key`` the value is the number of intermediate
    nodes connected to it by a KG edge (in either direction) divided by the total number
    of intermediate nodes. One virtual KG edge with predicate
    ``biolink:has_jaccard_index_with`` is added per end node, plus one virtual QEdge.

    :return: response (``self.response``), also when something went wrong; problems are
             reported through it, not raised.
    """
    message = self.message
    parameters = self.parameters
    self.response.debug(f"Computing Jaccard distance and adding this information as virtual edges")
    self.response.info(f"Computing Jaccard distance and adding this information as virtual edges")

    self.response.info("Getting all relevant nodes")
    # TODO: should I check that they're connected to the start node, or just assume that they are?
    # TODO: For now, assume that they are
    try:
        intermediate_nodes = set()
        end_node_to_intermediate_node_set = dict()  # end node key -> set of adjacent intermediate node keys
        subject_node_key = None  # stays None when no KG node is bound to the start qnode
        for key, node in message.knowledge_graph.nodes.items():
            if parameters['intermediate_node_key'] in node.qnode_keys:
                intermediate_nodes.add(key)
            if parameters['start_node_key'] in node.qnode_keys:
                subject_node_key = key
            if parameters['end_node_key'] in node.qnode_keys:
                end_node_to_intermediate_node_set[key] = set()

        # collect, for every end node, the intermediate nodes it touches (edge direction is ignored)
        # FW: only node ids are stored (not (node id, predicate) pairs) so that the numerator
        # matches the denominator, which counts intermediate nodes.
        for edge in message.knowledge_graph.edges.values():
            if edge.subject in intermediate_nodes:
                if edge.object in end_node_to_intermediate_node_set:
                    end_node_to_intermediate_node_set[edge.object].add(edge.subject)
            elif edge.object in intermediate_nodes:
                if edge.subject in end_node_to_intermediate_node_set:
                    end_node_to_intermediate_node_set[edge.subject].add(edge.object)

        # now compute the actual jaccard indexes
        denom = len(intermediate_nodes)
        if end_node_to_intermediate_node_set and denom == 0:
            # without intermediate nodes the ratio is undefined; say so instead of dividing by zero
            self.response.error(f"No node with intermediate qnode key {parameters['intermediate_node_key']} was found in the KG, so the Jaccard index is undefined")
            return self.response
        # TODO: add code here if you care about edge types
        end_node_to_jaccard = {
            end_node_key: len(adjacent) / float(denom)
            for end_node_key, adjacent in end_node_to_intermediate_node_set.items()
        }

        # edge properties shared by all virtual edges
        edge_type = 'biolink:has_jaccard_index_with'
        relation = parameters['virtual_relation_label']
        defined_datetime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        provided_by = "infores:arax"

        if subject_node_key is None:
            self.response.warning(
                f"subject node id: {parameters['start_node_key']} not found in the KG. Perhaps the KG is empty?")
            if end_node_to_jaccard:
                # virtual edges need a subject; refuse to create dangling edges
                self.response.error(f"Cannot add Jaccard virtual edges because no node with start qnode key {parameters['start_node_key']} was found in the KG")
                return self.response

        # now actually add the virtual edges in
        for j_iter, (end_node_key, jaccard_value) in enumerate(end_node_to_jaccard.items()):
            # try to ensure a unique edge key; on a clash retry with a random suffix until it is free
            edge_id = f"J{j_iter}"
            while edge_id in message.knowledge_graph.edges:
                edge_id = f"J{j_iter}.{random.randint(10**(9-1), (10**9)-1)}"

            edge_attribute_list = [
                EdgeAttribute(attribute_type_id='EDAM:data_1772', original_attribute_name="jaccard_index", value=jaccard_value, value_url=None),
                EdgeAttribute(original_attribute_name="virtual_relation_label", value=relation, attribute_type_id="biolink:Unknown"),
                EdgeAttribute(original_attribute_name="defined_datetime", value=defined_datetime, attribute_type_id="metatype:Datetime"),
                EdgeAttribute(original_attribute_name="provided_by", value=provided_by, attribute_type_id="biolink:aggregator_knowledge_source", attribute_source=provided_by, value_type_id="biolink:InformationResource"),
                EdgeAttribute(original_attribute_name=None, value=True, attribute_type_id="biolink:computed_value", attribute_source="infores:arax-reasoner-ara", value_type_id="metatype:Boolean", value_url=None, description="This edge is a container for a computed value between two nodes that is not directly attachable to other edges."),
            ]
            edge = Edge(predicate=edge_type, subject=subject_node_key, object=end_node_key, attributes=edge_attribute_list)
            edge.qedge_keys = [relation]  # a fresh list per edge so edges do not share mutable state
            message.knowledge_graph.edges[edge_id] = edge

        # Now add a q_edge to the query_graph since extra edges were added to the KG
        subject_qnode_key = parameters['start_node_key']
        object_qnode_key = parameters['end_node_key']
        option_group_id = ou.determine_virtual_qedge_option_group(subject_qnode_key, object_qnode_key, self.message.query_graph, self.response)
        q_edge = QEdge(predicates=[edge_type], subject=subject_qnode_key, object=object_qnode_key, option_group_id=option_group_id)
        q_edge.relation = relation
        self.message.query_graph.edges[relation] = q_edge

        return self.response
    except Exception as err:
        tb = traceback.format_exc()
        self.response.error(f"Something went wrong when computing the Jaccard index")
        self.response.error(tb, error_code=type(err).__name__)
        # hand the response back on failure too, so callers never receive None
        return self.response
def fisher_exact_test(self):
    """
    Perform Fisher's exact test to expand or decorate the knowledge graph.

    Reads these keys from ``self.parameters``:

    * ``subject_qnode_key``, ``object_qnode_key``, ``virtual_relation_label`` (required)
    * ``rel_edge_key`` (optional): only KG edges fulfilling this query edge are considered
    * ``top_n`` (optional): keep only the ``top_n`` smallest p-values
    * ``cutoff`` (optional): keep only p-values strictly below ``cutoff``

    For every surviving (subject node, object node) pair a virtual edge carrying the
    p-value is added to the message knowledge graph; if at least one edge was added,
    one virtual query edge keyed by ``virtual_relation_label`` is added to the query graph.

    Problems are logged through ``self.response.error``; they are not raised.

    :return: response
    """
    self.response.info("Performing Fisher's Exact Test to add p-value to edge attribute of virtual edge")

    # check the input parameters (checked in the same order as they are reported)
    for required_key in ('subject_qnode_key', 'virtual_relation_label', 'object_qnode_key'):
        if required_key not in self.parameters:
            self.response.error(f"The argument '{required_key}' is required for fisher_exact_test function")
            return self.response
    subject_qnode_key = self.parameters['subject_qnode_key']
    virtual_relation_label = str(self.parameters['virtual_relation_label'])
    object_qnode_key = self.parameters['object_qnode_key']
    rel_edge_key = self.parameters['rel_edge_key'] if 'rel_edge_key' in self.parameters else None
    top_n = int(self.parameters['top_n']) if 'top_n' in self.parameters else None
    cutoff = float(self.parameters['cutoff']) if 'cutoff' in self.parameters else None

    # initialize some variables
    nodes_info = {}
    edge_expand_kp = []
    subject_node_list = []
    object_node_dict = {}
    size_of_object = {}
    subject_node_exist = False
    object_node_exist = False
    query_edge_key = set()
    rel_edge_type = set()
    subject_node_category = None
    object_node_category = None

    def _log_worker_errors(results):
        """Log the messages of every failed worker result; return True if any worker failed."""
        # a worker signals failure by returning a list of messages (or (message, code) tuples)
        failures = [elem for elem in results if isinstance(elem, list)]
        for failure in failures:
            for msg in failure:
                if isinstance(msg, tuple):
                    self.response.error(msg[0], error_code=msg[1])
                else:
                    self.response.error(msg)
        return bool(failures)

    ## Check if subject_qnode_key and object_qnode_key are in the Query Graph
    try:
        qg_nodes = self.message.query_graph.nodes
        if len(qg_nodes) == 0:
            self.response.error("There is no query node in QG")
            return self.response
        for node_key in qg_nodes:
            if node_key == subject_qnode_key:
                subject_node_exist = True
                subject_node_category = qg_nodes[node_key].category
            elif node_key == object_qnode_key:
                object_node_exist = True
                object_node_category = qg_nodes[node_key].category
    except Exception as err:
        self.response.error(traceback.format_exc(), error_code=type(err).__name__)
        self.response.error("Something went wrong with retrieving nodes in message QG")
        return self.response

    if not subject_node_exist:
        self.response.error(f"No query node with subject qnode key {subject_qnode_key} detected in QG for Fisher's Exact Test")
        return self.response
    if not object_node_exist:
        self.response.error(f"No query node with object qnode key {object_qnode_key} detected in QG for Fisher's Exact Test")
        return self.response

    ## Check if there is a query edge connected to both subject_qnode_key and object_qnode_key in the Query Graph
    try:
        qg_edges = self.message.query_graph.edges
        if len(qg_edges) == 0:
            self.response.error("There is no query edge in Query Graph")
            return self.response
        for edge_key in qg_edges:
            qedge = qg_edges[edge_key]
            connects_pair = ((qedge.subject == subject_qnode_key and qedge.object == object_qnode_key)
                             or (qedge.subject == object_qnode_key and qedge.object == subject_qnode_key))
            # only actual query edges are kept: edges carrying a relation are skipped
            if connects_pair and qedge.relation is None:
                query_edge_key.add(edge_key)
    except Exception as err:
        self.response.error(traceback.format_exc(), error_code=type(err).__name__)
        self.response.error("Something went wrong with retrieving edges in message QG")
        return self.response

    if len(query_edge_key) == 0:
        self.response.error(f"No query edge connected to both subject node with qnode key {subject_qnode_key} and object node with qnode key {object_qnode_key} detected in QG for Fisher's Exact Test")
        return self.response
    if rel_edge_key and rel_edge_key not in query_edge_key:
        self.response.error(f"No query edge with qedge key {rel_edge_key} connected to both subject node with qnode key {subject_qnode_key} and object node with qnode key {object_qnode_key} detected in QG for Fisher's Exact Test")
        return self.response

    ## loop over all nodes in KG and collect their node information
    try:
        for count, (node_key, node) in enumerate(self.message.knowledge_graph.nodes.items()):
            nodes_info[node_key] = {'count': count, 'qnode_keys': node.qnode_keys, 'category': node.category, 'edge_index': []}
    except Exception as err:
        self.response.error(traceback.format_exc(), error_code=type(err).__name__)
        self.response.error("Something went wrong with retrieving nodes in message KG")
        return self.response

    ## loop over all edges in KG and create subject node list and object node dict based on subject_qnode_key,
    ## object_qnode_key as well as rel_edge_key (optional, otherwise all edges are considered)
    try:
        # count records the edge position in message.knowledge_graph
        for count, edge in enumerate(self.message.knowledge_graph.edges.values()):
            edge_attribute_dict = {x.name: x.value for x in edge.attributes}
            defined_by = edge_attribute_dict['is_defined_by']
            if defined_by == 'ARAX':
                continue  # edges defined by ARAX itself are not counted
            subject_info = nodes_info[edge.subject]
            object_info = nodes_info[edge.object]
            subject_info['edge_index'].append(count)
            object_info['edge_index'].append(count)

            # decide which end of the KG edge plays the "subject" role of the test
            if rel_edge_key:
                if rel_edge_key not in edge.qedge_keys:
                    continue
                rel_edge_type.add(edge.predicate)
                if subject_qnode_key in subject_info['qnode_keys']:
                    query_curie, adjacent_curie = edge.subject, edge.object
                else:
                    query_curie, adjacent_curie = edge.object, edge.subject
            else:
                if subject_qnode_key in subject_info['qnode_keys']:
                    if object_qnode_key not in object_info['qnode_keys']:
                        continue
                    query_curie, adjacent_curie = edge.subject, edge.object
                elif object_qnode_key in subject_info['qnode_keys']:
                    if subject_qnode_key not in object_info['qnode_keys']:
                        continue
                    query_curie, adjacent_curie = edge.object, edge.subject
                else:
                    continue

            edge_expand_kp.append(defined_by)
            subject_node_list.append(query_curie)
            object_node_dict.setdefault(adjacent_curie, set()).add(query_curie)
    except Exception as err:
        self.response.error(traceback.format_exc(), error_code=type(err).__name__)
        self.response.error("Something went wrong with retrieving edges in message KG")
        return self.response

    subject_node_list = list(set(subject_node_list))  ## remove the duplicate subject node key

    ## check if there is no subject node in message KG
    if len(subject_node_list) == 0:
        self.response.error("No subject node found in message KG for Fisher's Exact Test")
        return self.response

    ## check if there is no object node in message KG
    if len(object_node_dict) == 0:
        self.response.error("No object node found in message KG for Fisher's Exact Test")
        return self.response

    ## the category of both query nodes must be specified in the Query Graph
    if subject_node_category is None:
        self.response.error(f"Subject node with qnode key {subject_qnode_key} was set to None in Query Graph. Please specify the node type")
        return self.response
    if object_node_category is None:
        self.response.error(f"Object node with qnode key {object_qnode_key} was set to None in Query Graph. Please specify the node type")
        return self.response

    ## check how many kps were used in message KG. If more than one, the one with the max number of
    ## edges connected to both subject nodes and object nodes is used
    occurrences = collections.Counter(edge_expand_kp)
    if len(occurrences) == 1:
        kp = edge_expand_kp[0]
    else:
        # ties on the edge count are resolved in favour of the provider that was first seen last
        # (the largest position in the counter), not by alphabetical order
        max_index = max((value, index) for index, value in enumerate(occurrences.values()))[1]
        kp = list(occurrences.keys())[max_index]
        self.response.debug(f"{occurrences}")
        self.response.warning(f"More than one knowledge provider was detected to be used for expanding the edges connected to both subject node with qnode key {subject_qnode_key} and object node with qnode key {object_qnode_key}")
        self.response.warning(f"The knowledge provider {kp} was used to calculate Fisher's exact test because it has the maximum number of edges both subject node with qnode key {subject_qnode_key} and object node with qnode key {object_qnode_key}")

    ## Print out some information used to calculate FET
    subject_noun = "subject node" if len(subject_node_list) == 1 else "subject nodes"
    self.response.debug(f"{len(subject_node_list)} {subject_noun} with qnode key {subject_qnode_key} and node type {subject_node_category} was found in message KG and used to calculate Fisher's Exact Test")
    object_noun = "object node" if len(object_node_dict) == 1 else "object nodes"
    self.response.debug(f"{len(object_node_dict)} {object_noun} with qnode key {object_qnode_key} and node type {object_node_category} was found in message KG and used to calculate Fisher's Exact Test")

    # the edge predicate is only used as a filter when rel_edge_key was given and its KG edges share one predicate
    single_rel_type = bool(rel_edge_key) and len(rel_edge_type) == 1
    rel_type = list(rel_edge_type)[0] if single_rel_type else None

    # find all nodes with the same type of 'subject_qnode_key' nodes in specified KP ('ARAX/KG1','ARAX/KG2','BTE') that are adjacent to target nodes
    use_parallel = False

    if not use_parallel:
        # query adjacent node in one DSL command by providing a list of query nodes to add_qnode()
        if single_rel_type:
            self.response.debug(f"{kp} and edge relation type {rel_type} were used to calculate total object nodes in Fisher's Exact Test")
        else:
            if rel_edge_key:
                self.response.warning(f"The edges with specified qedge key {rel_edge_key} have more than one category, we ignore the edge predicate and use all categories to calculate Fisher's Exact Test")
            self.response.debug(f"{kp} was used to calculate total object nodes in Fisher's Exact Test")
        result = self.query_size_of_adjacent_nodes(node_curie=list(object_node_dict.keys()), source_type=object_node_category, adjacent_type=subject_node_category, kp=kp, rel_type=rel_type, use_cypher_command=False)

        if result is None:
            return self.response  ## Something wrong happened for querying the adjacent nodes
        res, removed_nodes = result
        if len(removed_nodes) != 0:
            if len(removed_nodes) == 1:
                self.response.warning(f"One object node which is {removed_nodes[0]} can't find its neighbors. This node will be ignored for FET calculation.")
            else:
                self.response.warning(f"{len(removed_nodes)} object nodes which are {removed_nodes} can't find its neighbors. These nodes will be ignored for FET calculation.")
            for node in removed_nodes:
                del object_node_dict[node]
        size_of_object = res
    else:
        # query adjacent node for query nodes one by one in parallel
        if single_rel_type:
            self.response.debug(f"{kp} and edge relation type {rel_type} were used to calculate total adjacent nodes in Fisher's Exact Test")
        else:
            if rel_edge_key:
                self.response.warning(f"The edges with specified qedge key {rel_edge_key} have more than one type, we ignore the edge type and use all types to calculate Fisher's Exact Test")
            self.response.debug(f"{kp} was used to calculate total adjacent nodes in Fisher's Exact Test")
        parameter_list = [(node, object_node_category, subject_node_category, kp, rel_type) for node in object_node_dict]

        ## get the count of all nodes with the type of 'subject_qnode_key' nodes in KP for each target node in parallel
        try:
            with multiprocessing.Pool() as executor:
                object_count_res = list(executor.map(self._query_size_of_adjacent_nodes_parallel, parameter_list))
        except Exception as err:
            self.response.error(traceback.format_exc(), error_code=type(err).__name__)
            self.response.error("Something went wrong with querying adjacent nodes in parallel")
            return self.response

        if _log_worker_errors(object_count_res):
            return self.response  ## Something wrong happened for querying the adjacent nodes
        # map() keeps the input order, so results line up with the dict's keys
        for node, node_count in zip(object_node_dict, object_count_res):
            size_of_object[node] = node_count

    # every object node may have been dropped above; then there is nothing left to test
    if len(object_node_dict) == 0:
        return self.response

    ## Based on KP detected in message KG, find the total number of node with the same type of source node
    if kp == 'ARAX/KG1':
        size_of_total = self.size_of_given_type_in_KP(node_type=subject_node_category, use_cypher_command=False, kg='KG1')
        if size_of_total != 0:
            self.response.debug("ARAX/KG1 and cypher query were used to calculate total number of node with the same type of source node in Fisher's Exact Test")
            self.response.debug(f"Total {size_of_total} unique concepts with node category {subject_node_category} was found in ARAX/KG1")
        else:
            # KG1 reported no node of this category, so fall back to KG2
            size_of_total = self.size_of_given_type_in_KP(node_type=subject_node_category, use_cypher_command=False, kg='KG2')
            if size_of_total == 0:
                self.response.error(f"Both KG1 and KG2 have 0 node with the same type of subject node with qnode key {subject_qnode_key}")
                return self.response
            self.response.debug(f"Since KG1 can't find the any nodes with node category {subject_node_category}, ARAX/KG2C were used to calculate total number of node with the same type of source node in Fisher's Exact Test")
            self.response.debug(f"Total {size_of_total} unique concepts with node category {subject_node_category} was found in ARAX/KG2C")
    elif kp == 'ARAX/KG2' or kp == 'ARAX/KG2c':
        size_of_total = self.size_of_given_type_in_KP(node_type=subject_node_category, use_cypher_command=False, kg='KG2')
        self.response.debug("ARAX/KG2C were used to calculate total number of node with the same type of source node in Fisher's Exact Test")
        self.response.debug(f"Total {size_of_total} unique concepts with node category {subject_node_category} was found in ARAX/KG2C")
    else:
        self.response.error("Only KG1 or KG2 is allowable to calculate the Fisher's exact test temporally")
        return self.response

    size_of_query_sample = len(subject_node_list)

    self.response.debug("Computing Fisher's Exact Test P-value")
    # build one (node, four counts) tuple per object node for the p-value worker
    del_list = []
    parameter_list = []
    for node in object_node_dict:
        linked = len(object_node_dict[node])
        if size_of_object[node] - linked < 0:
            del_list.append(node)
            self.response.warning(f"Skipping node {node} to calculate FET p-value due to issue897 (which causes negative value).")
            continue
        parameter_list.append((node,
                               linked,
                               size_of_object[node] - linked,
                               size_of_query_sample - linked,
                               (size_of_total - size_of_object[node]) - (size_of_query_sample - linked)))
    for del_node in del_list:
        del object_node_dict[del_node]

    # calculate FET p-value for each target node in parallel
    try:
        with multiprocessing.Pool() as executor:
            FETpvalue_list = list(executor.map(self._calculate_FET_pvalue_parallel, parameter_list))
    except Exception as err:
        self.response.error(traceback.format_exc(), error_code=type(err).__name__)
        self.response.error("Something went wrong with computing Fisher's Exact Test P-value")
        return self.response

    if _log_worker_errors(FETpvalue_list):
        return self.response

    # sort by ascending p-value, then check if the results need to be filtered
    output = dict(sorted(dict(FETpvalue_list).items(), key=lambda x: x[1]))
    # NOTE(review): a cutoff of 0.0 or a top_n of 0 is falsy and is therefore treated as "not set"
    if cutoff:
        output = {node: pvalue for node, pvalue in output.items() if pvalue < cutoff}
    if top_n:
        output = dict(list(output.items())[:top_n])

    # add the virtual edge with FET result to message KG
    self.response.debug("Adding virtual edge with FET result to message KG")

    count = 0
    for adj in object_node_dict:
        if adj not in output:
            continue
        for node in object_node_dict[adj]:
            count = count + 1  # edge ids are numbered from 1
            edge_attribute_list = [
                EdgeAttribute(type="EDAM:data_1669", name="fisher_exact_test_p-value", value=str(output[adj]), url=None),
                EdgeAttribute(name="is_defined_by", value="ARAX", type="ARAX_TYPE_PLACEHOLDER"),
                EdgeAttribute(name="defined_datetime", value=datetime.now().strftime("%Y-%m-%d %H:%M:%S"), type="metatype:Datetime"),
                EdgeAttribute(name="provided_by", value="ARAX", type="biolink:provided_by"),
            ]
            edge_id = f"{virtual_relation_label}_{count}"
            edge = Edge(predicate='biolink:has_fisher_exact_test_p-value_with', subject=node, object=adj, relation=virtual_relation_label, attributes=edge_attribute_list)
            edge.qedge_keys = [virtual_relation_label]
            # NOTE(review): an existing KG edge with the same id is overwritten here
            self.message.knowledge_graph.edges[edge_id] = edge

    self.response.debug(f"{count} new virtual edges were added to message KG")

    # add the virtual edge to message QG
    if count > 0:
        self.response.debug("Adding virtual edge to message QG")
        edge_type = "biolink:has_fisher_exact_test_p-value_with"
        option_group_id = ou.determine_virtual_qedge_option_group(subject_qnode_key, object_qnode_key, self.message.query_graph, self.response)
        q_edge = QEdge(predicate=edge_type, relation=virtual_relation_label, subject=subject_qnode_key, object=object_qnode_key, option_group_id=option_group_id)
        self.message.query_graph.edges[virtual_relation_label] = q_edge
        self.response.debug("One virtual edge was added to message QG")

    return self.response
def compute_ngd(self):
    """
    Compute the normalized Google distance (NGD) and attach it to knowledge graph edges.

    If ``virtual_relation_label`` is present in ``self.parameters``, a virtual edge carrying the
    NGD is added for every node pair returned by ``ou.get_node_pairs_to_overlay`` for
    ``subject_qnode_key``/``object_qnode_key``, plus one matching virtual query edge.
    Otherwise every existing knowledge graph edge is decorated with an NGD attribute and a
    publications attribute.

    ``self.parameters['default_value']`` is the value stored whenever the computed NGD is not
    finite; it is applied per node pair, so a finite NGD of one pair never leaks into the next.

    :return: response
    """
    if self.response.status != 'OK':  # Catches any errors that may have been logged during initialization
        self._close_database()
        return self.response
    parameters = self.parameters
    self.response.debug("Computing NGD")
    self.response.info("Computing the normalized Google distance: weighting edges based on subject/object node "
                       "co-occurrence frequency in PubMed abstracts")
    name = "normalized_google_distance"
    attribute_type = "EDAM:data_2526"
    default_value = parameters['default_value']
    url = "https://arax.ncats.io/api/rtx/v1/ui/#/PubmedMeshNgd"
    qg = self.message.query_graph
    kg = self.message.knowledge_graph

    # if you want to add virtual edges, identify the subject/objects, decorate the edges, add them to the KG,
    # and then add one to the QG corresponding to them
    if 'virtual_relation_label' in parameters:
        # Figure out which node pairs to compute NGD between
        subject_qnode_key = parameters['subject_qnode_key']
        object_qnode_key = parameters['object_qnode_key']
        relation = parameters['virtual_relation_label']
        edge_type = "biolink:has_normalized_google_distance_with"
        node_pairs_to_evaluate = ou.get_node_pairs_to_overlay(subject_qnode_key, object_qnode_key, qg, kg, self.response)
        # Grab PMID lists for all involved nodes
        involved_curies = {curie for node_pair in node_pairs_to_evaluate for curie in node_pair}
        canonicalized_curie_lookup = self._get_canonical_curies_map(list(involved_curies))
        self.load_curie_to_pmids_data(canonicalized_curie_lookup.values())
        added_flag = False  # check to see if any edges were added
        self.response.debug(f"Looping through {len(node_pairs_to_evaluate)} node pairs and calculating NGD values")
        # iterate over all pairs of these nodes, add the virtual edge, decorate with the correct attribute
        for (subject_curie, object_curie) in node_pairs_to_evaluate:
            canonical_subject_curie = canonicalized_curie_lookup.get(subject_curie, subject_curie)
            canonical_object_curie = canonicalized_curie_lookup.get(object_curie, object_curie)
            ngd_value, pmid_set = self.calculate_ngd_fast(canonical_subject_curie, canonical_object_curie)
            # fall back to the default for THIS pair only when its NGD is not finite
            edge_value = ngd_value if np.isfinite(ngd_value) else default_value
            edge_attribute = EdgeAttribute(type=attribute_type, name=name, value=str(edge_value), url=url)  # populate the NGD edge attribute
            pmid_attribute = EdgeAttribute(type="biolink:publications", name="publications", value=[f"PMID:{pmid}" for pmid in pmid_set])
            if edge_attribute:
                added_flag = True
                defined_datetime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

                # ensure the id is unique
                # might need to change after expand is implemented for TRAPI 1.0
                edge_id = f"{relation}_{self.global_iter}"
                while edge_id in self.message.knowledge_graph.edges:
                    edge_id = f"{relation}_{self.global_iter}.{random.randint(10**(9-1), (10**9)-1)}"
                self.global_iter += 1

                edge_attribute_list = [
                    edge_attribute,
                    pmid_attribute,
                    EdgeAttribute(name="is_defined_by", value="ARAX", type="ARAX_TYPE_PLACEHOLDER"),
                    EdgeAttribute(name="defined_datetime", value=defined_datetime, type="metatype:Datetime"),
                    EdgeAttribute(name="provided_by", value="ARAX", type="biolink:provided_by"),
                ]
                edge = Edge(predicate=edge_type, subject=subject_curie, object=object_curie, relation=relation, attributes=edge_attribute_list)
                edge.qedge_keys = [relation]
                self.message.knowledge_graph.edges[edge_id] = edge

        # Now add a q_edge the query_graph since I've added an extra edge to the KG
        if added_flag:
            option_group_id = ou.determine_virtual_qedge_option_group(subject_qnode_key, object_qnode_key, qg, self.response)
            q_edge = QEdge(predicate=edge_type, relation=relation, subject=subject_qnode_key, object=object_qnode_key, option_group_id=option_group_id)
            self.message.query_graph.edges[relation] = q_edge

        self.response.info("NGD values successfully added to edges")
    else:  # you want to add it for each edge in the KG
        # iterate over KG edges, add the information
        try:
            # Map all nodes to their canonicalized curies in one batch (need canonical IDs for the local NGD system)
            canonicalized_curie_map = self._get_canonical_curies_map(list(self.message.knowledge_graph.nodes.keys()))
            self.load_curie_to_pmids_data(canonicalized_curie_map.values())
            self.response.debug("Looping through edges and calculating NGD values")
            for edge in self.message.knowledge_graph.edges.values():
                # Make sure the attributes are not None
                if not edge.attributes:
                    edge.attributes = []
                # now go and actually get the NGD
                subject_curie = edge.subject
                object_curie = edge.object
                canonical_subject_curie = canonicalized_curie_map.get(subject_curie, subject_curie)
                canonical_object_curie = canonicalized_curie_map.get(object_curie, object_curie)
                ngd_value, pmid_set = self.calculate_ngd_fast(canonical_subject_curie, canonical_object_curie)
                # fall back to the default for THIS edge only when its NGD is not finite
                edge_value = ngd_value if np.isfinite(ngd_value) else default_value
                ngd_edge_attribute = EdgeAttribute(type=attribute_type, name=name, value=str(edge_value), url=url)  # populate the NGD edge attribute
                pmid_edge_attribute = EdgeAttribute(type="biolink:publications", name="ngd_publications", value=[f"PMID:{pmid}" for pmid in pmid_set])
                edge.attributes.append(ngd_edge_attribute)  # append it to the list of attributes
                edge.attributes.append(pmid_edge_attribute)
        except Exception as err:
            self.response.error(traceback.format_exc(), error_code=type(err).__name__)
            self.response.error("Something went wrong adding the NGD edge attributes")
        else:
            self.response.info("NGD values successfully added to edges")
    self._close_database()
    return self.response
def add_virtual_edge(self, name="", default=0.):
    """
    Add virtual edges between subject and object nodes to the KG, and a matching edge to the QG.

    Every knowledge graph node bound to ``subject_qnode_key`` is paired with every node bound to
    ``object_qnode_key`` (both read from ``self.parameters``). A pair gets a virtual edge only if
    ``self.make_edge_attribute_from_curies`` returns a truthy attribute for it.

    :name: name of the functionality of the KP to use; also used to build the edge predicate
    :default: forwarded unchanged to ``make_edge_attribute_from_curies``
    """
    params = self.parameters
    subject_curies = set()
    object_curies = set()
    # FIXME: Super hacky way to get around the fact that COHD can't map CHEMBL drugs
    names_by_curie = dict()

    # identify the nodes that we should be adding virtual edges for
    for curie, kg_node in self.message.knowledge_graph.nodes.items():
        if not hasattr(kg_node, 'qnode_keys'):
            continue
        if params['subject_qnode_key'] in kg_node.qnode_keys:
            subject_curies.add(curie)
            names_by_curie[curie] = kg_node.name
        if params['object_qnode_key'] in kg_node.qnode_keys:
            object_curies.add(curie)
            names_by_curie[curie] = kg_node.name

    any_edge_added = False  # becomes True once at least one pair produced an attribute
    for subject_curie, object_curie in itertools.product(subject_curies, object_curies):
        # create the edge attribute if it can be
        decorated = self.make_edge_attribute_from_curies(
            subject_curie,
            object_curie,
            subject_name=names_by_curie[subject_curie],
            object_name=names_by_curie[object_curie],
            default=default,
            name=name)
        if not decorated:
            continue
        any_edge_added = True

        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        label = params['virtual_relation_label']

        # ensure the id is unique
        # might need to change after expand is implemented for TRAPI 1.0
        edge_key = f"{label}_{self.global_iter}"
        while edge_key in self.message.knowledge_graph.edges:
            edge_key = f"{label}_{self.global_iter}.{random.randint(10**(9-1), (10**9)-1)}"
        self.global_iter += 1

        virtual_edge = Edge(
            predicate=f"biolink:has_{name}_with",
            subject=subject_curie,
            object=object_curie,
            relation=label,
            attributes=[
                decorated,
                EdgeAttribute(name="is_defined_by", value="ARAX", type="ARAX_TYPE_PLACEHOLDER"),
                EdgeAttribute(name="defined_datetime", value=timestamp, type="metatype:Datetime"),
                EdgeAttribute(name="provided_by", value="ARAX", type="biolink:provided_by"),
            ])
        virtual_edge.qedge_keys = [label]
        self.message.knowledge_graph.edges[edge_key] = virtual_edge

    if not any_edge_added:
        return

    # Now add a q_edge the query_graph since an extra edge was added to the KG
    label = params['virtual_relation_label']
    subject_qnode_key = params['subject_qnode_key']
    object_qnode_key = params['object_qnode_key']
    option_group_id = ou.determine_virtual_qedge_option_group(
        subject_qnode_key, object_qnode_key, self.message.query_graph, self.response)
    self.message.query_graph.edges[label] = QEdge(
        predicate=f"biolink:has_{name}_with",
        relation=label,
        subject=subject_qnode_key,
        object=object_qnode_key,
        option_group_id=option_group_id)
def compute_ngd(self):
    """
    Compute the normalized Google distance (NGD) and record it in the message.

    The mode is selected from ``self.parameters``:

    * ``virtual_relation_label`` present and neither ``subject_qnode_key`` nor
      ``object_qnode_key`` present: add NGD virtual edges for every distinct
      (unordered) pair of qnodes joined by an edge of the query graph.
    * ``virtual_relation_label`` present otherwise: add NGD virtual edges for the
      qnode pair named by ``subject_qnode_key`` / ``object_qnode_key``.
    * no ``virtual_relation_label``: append an NGD attribute and a PMID attribute
      to every edge already in the knowledge graph.

    NGD values that are not finite are replaced by ``self.parameters['default_value']``.
    The database is closed before returning in every case.

    :return: response
    """
    if self.response.status != 'OK':  # Catches any errors that may have been logged during initialization
        self._close_database()
        return self.response
    parameters = self.parameters
    self.response.debug("Computing NGD")
    self.response.info("Computing the normalized Google distance: weighting edges based on subject/object node "
                       "co-occurrence frequency in PubMed abstracts")
    default_value = self.parameters['default_value']
    qg = self.message.query_graph
    kg = self.message.knowledge_graph
    ngd_description = """
    Normalized google distance is a metric based on edge subject/object node co-occurrence in abstracts of all [PubMed](https://pubmed.ncbi.nlm.nih.gov/) articles.
    The formula can be found here on [wikipedia.](https://en.wikipedia.org/wiki/Normalized_Google_distance)
    Where in this case f(x,y) is the number of PubMed abstracts both concepts apear in, f(x)/f(y) are the number of abstracts individual concepts apear in, and N is the number of pubmed articles times the average numbver of search terms per article (27 million * 20).
    """
    # Keyword arguments shared by every NGD EdgeAttribute built below; only `value` varies per edge
    ngd_attribute_kwargs = {
        "attribute_type_id": "EDAM:data_2526",
        "original_attribute_name": "normalized_google_distance",
        "value_url": "https://arax.ncats.io/api/rtx/v1/ui/#/PubmedMeshNgd",
        "description": ngd_description,
    }

    if 'virtual_relation_label' in parameters:
        if 'subject_qnode_key' not in parameters and 'object_qnode_key' not in parameters:
            # FW: a virtual relation label but no subject/object means: add edges for all
            # subject/object pairs in the query graph.
            seen_node_pairs = set()
            # Iterate over a copy, because handling a pair may add a virtual qedge to qg.edges
            for query_edge in copy.deepcopy(list(qg.edges.values())):
                subject_qnode_key = query_edge.subject
                object_qnode_key = query_edge.object
                # Order-independent key, so (a, b) and (b, a) are only handled once
                if subject_qnode_key < object_qnode_key:
                    qnode_key_pair = (subject_qnode_key, object_qnode_key)
                else:
                    qnode_key_pair = (object_qnode_key, subject_qnode_key)
                if qnode_key_pair in seen_node_pairs:
                    continue
                seen_node_pairs.add(qnode_key_pair)
                self._add_ngd_virtual_edges(subject_qnode_key, object_qnode_key,
                                            ngd_attribute_kwargs, default_value)
                self.response.info(f"NGD values successfully added to edges for the qnode pair ({subject_qnode_key},{object_qnode_key})")
        else:
            # Figure out which node pairs to compute NGD between
            subject_qnode_key = parameters['subject_qnode_key']
            object_qnode_key = parameters['object_qnode_key']
            self._add_ngd_virtual_edges(subject_qnode_key, object_qnode_key,
                                        ngd_attribute_kwargs, default_value)
            self.response.info("NGD values successfully added to edges")
    else:
        # No virtual edges requested: decorate each edge that is already in the KG
        try:
            # Map all nodes to their canonicalized curies in one batch (need canonical IDs for the local NGD system)
            canonicalized_curie_map = self._get_canonical_curies_map(list(self.message.knowledge_graph.nodes.keys()))
            self.load_curie_to_pmids_data(canonicalized_curie_map.values())
            self.response.debug("Looping through edges and calculating NGD values")
            for edge in self.message.knowledge_graph.edges.values():
                # Make sure the attributes are not None
                if not edge.attributes:
                    edge.attributes = []
                subject_curie = edge.subject
                object_curie = edge.object
                canonical_subject_curie = canonicalized_curie_map.get(subject_curie, subject_curie)
                canonical_object_curie = canonicalized_curie_map.get(object_curie, object_curie)
                ngd_value, pmid_set = self.calculate_ngd_fast(canonical_subject_curie, canonical_object_curie)
                # if ngd is finite, that's ok, otherwise, stay with default
                edge_value = ngd_value if np.isfinite(ngd_value) else default_value
                ngd_edge_attribute = EdgeAttribute(value=str(edge_value), **ngd_attribute_kwargs)
                pmid_edge_attribute = EdgeAttribute(attribute_type_id="biolink:publications",
                                                    original_attribute_name="ngd_publications",
                                                    value_type_id="EDAM:data_1187",
                                                    value=[f"PMID:{pmid}" for pmid in pmid_set])
                edge.attributes.append(ngd_edge_attribute)
                edge.attributes.append(pmid_edge_attribute)
        except Exception as error:
            # Log instead of raising so the caller still gets a response object back;
            # KeyboardInterrupt/SystemExit are deliberately not swallowed.
            self.response.error(traceback.format_exc(), error_code=type(error).__name__)
            self.response.error("Something went wrong adding the NGD edge attributes")
        else:
            self.response.info("NGD values successfully added to edges")
    self._close_database()
    return self.response


def _add_ngd_virtual_edges(self, subject_qnode_key, object_qnode_key, ngd_attribute_kwargs, default_value):
    """
    Add NGD virtual edges between the knowledge-graph nodes of one qnode pair.

    For every node pair returned by ``ou.get_node_pairs_to_overlay`` a new edge with
    predicate ``biolink:has_normalized_google_distance_with`` is stored in the knowledge
    graph (and linked into existing results, if any). If at least one edge was added,
    a matching QEdge is stored in the query graph under the virtual relation label.

    :param subject_qnode_key: query-graph key of the subject qnode
    :param object_qnode_key: query-graph key of the object qnode
    :param ngd_attribute_kwargs: keyword arguments (everything but ``value``) for the NGD EdgeAttribute
    :param default_value: value stored when the computed NGD is not finite
    :return: True if at least one virtual edge was added, otherwise False
    """
    qg = self.message.query_graph
    kg = self.message.knowledge_graph
    relation = self.parameters['virtual_relation_label']
    edge_type = "biolink:has_normalized_google_distance_with"
    provided_by = "infores:arax"
    node_pairs_to_evaluate = ou.get_node_pairs_to_overlay(subject_qnode_key, object_qnode_key, qg, kg, self.response)
    # Grab PMID lists for all involved nodes
    involved_curies = {curie for node_pair in node_pairs_to_evaluate for curie in node_pair}
    canonicalized_curie_lookup = self._get_canonical_curies_map(list(involved_curies))
    self.load_curie_to_pmids_data(canonicalized_curie_lookup.values())
    added_flag = False  # check to see if any edges were added
    self.response.debug(f"Looping through {len(node_pairs_to_evaluate)} node pairs and calculating NGD values")
    for subject_curie, object_curie in node_pairs_to_evaluate:
        canonical_subject_curie = canonicalized_curie_lookup.get(subject_curie, subject_curie)
        canonical_object_curie = canonicalized_curie_lookup.get(object_curie, object_curie)
        ngd_value, pmid_set = self.calculate_ngd_fast(canonical_subject_curie, canonical_object_curie)
        # if ngd is finite, that's ok, otherwise, stay with default
        edge_value = ngd_value if np.isfinite(ngd_value) else default_value
        edge_attribute = EdgeAttribute(value=str(edge_value), **ngd_attribute_kwargs)
        pmid_attribute = EdgeAttribute(attribute_type_id="biolink:publications",
                                       original_attribute_name="publications",
                                       value=[f"PMID:{pmid}" for pmid in pmid_set])
        if not edge_attribute:  # guard kept from the original implementation
            continue
        added_flag = True
        defined_datetime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        # Pick a KG edge key that is not in use yet
        # (might need to change after expand is implemented for TRAPI 1.0)
        edge_key = f"{relation}_{self.global_iter}"
        while edge_key in kg.edges:
            edge_key = f"{relation}_{self.global_iter}.{random.randint(10**(9-1), (10**9)-1)}"
        self.global_iter += 1
        edge_attribute_list = [
            edge_attribute,
            pmid_attribute,
            EdgeAttribute(original_attribute_name="virtual_relation_label", value=relation,
                          attribute_type_id="biolink:Unknown"),
            EdgeAttribute(original_attribute_name="defined_datetime", value=defined_datetime,
                          attribute_type_id="metatype:Datetime"),
            EdgeAttribute(original_attribute_name="provided_by", value=provided_by,
                          attribute_type_id="biolink:aggregator_knowledge_source",
                          attribute_source=provided_by, value_type_id="biolink:InformationResource"),
            EdgeAttribute(original_attribute_name=None, value=True,
                          attribute_type_id="biolink:computed_value",
                          attribute_source="infores:arax-reasoner-ara", value_type_id="metatype:Boolean",
                          value_url=None,
                          description="This edge is a container for a computed value between two nodes that is not directly attachable to other edges."),
        ]
        # FIXME (EWD, temporary hack): `relation` is not passed to Edge; the label is only
        # carried by the "virtual_relation_label" attribute above.
        edge = Edge(predicate=edge_type, subject=subject_curie, object=object_curie,
                    attributes=edge_attribute_list)
        edge.qedge_keys = [relation]
        kg.edges[edge_key] = edge
        # FW: check if results exist then modify them with the ngd edge
        if self.message.results is not None and len(self.message.results) > 0:
            ou.update_results_with_overlay_edge(subject_knode_key=subject_curie, object_knode_key=object_curie,
                                                kedge_key=edge_key, message=self.message, log=self.response)
    # Now add a q_edge to the query_graph since extra edges were added to the KG
    if added_flag:
        option_group_id = ou.determine_virtual_qedge_option_group(subject_qnode_key, object_qnode_key, qg, self.response)
        # FIXME (EWD, for later fixing): relation is assigned after construction
        # instead of being passed to the QEdge constructor.
        q_edge = QEdge(predicates=[edge_type], subject=subject_qnode_key, object=object_qnode_key,
                       option_group_id=option_group_id)
        q_edge.relation = relation
        self.message.query_graph.edges[relation] = q_edge
    return added_flag
def flip_edge(edge: Edge, new_predicate: str) -> Edge:
    """
    Reverse the direction of an edge in place.

    The edge receives ``new_predicate`` and its subject and object trade places.

    :param edge: edge to modify (mutated, not copied)
    :param new_predicate: predicate to assign to the flipped edge
    :return: the same edge object that was passed in
    """
    edge.predicate = new_predicate
    # Exchange the two endpoints in a single step
    edge.subject, edge.object = edge.object, edge.subject
    return edge
def add_virtual_edge(self, name="", default=0.):
    """
    Generic function to add virtual edges to the KG and a matching virtual edge to the QG.

    Knowledge-graph nodes whose ``qnode_keys`` contain ``parameters['subject_qnode_key']``
    are paired with those containing ``parameters['object_qnode_key']``. For each pair for
    which ``make_edge_attribute_from_curies`` returns an attribute, an edge with predicate
    ``biolink:has_real_world_evidence_of_association_with`` is added to the knowledge graph
    (and linked into existing results, if any). If at least one edge was added, a QEdge is
    stored in the query graph under ``parameters['virtual_relation_label']``.

    :param name: name of the functionality of the KP to use (forwarded to
        ``make_edge_attribute_from_curies``)
    :param default: default value forwarded to ``make_edge_attribute_from_curies``
    :return: None
    """
    parameters = self.parameters
    edge_type = "biolink:has_real_world_evidence_of_association_with"
    provided_by = "infores:arax"
    subject_curies_to_decorate = set()
    object_curies_to_decorate = set()
    curies_to_names = dict()  # FIXME: Super hacky way to get around the fact that COHD can't map CHEMBL drugs
    # identify the nodes that we should be adding virtual edges for
    for key, node in self.message.knowledge_graph.nodes.items():
        if hasattr(node, 'qnode_keys'):
            if parameters['subject_qnode_key'] in node.qnode_keys:
                subject_curies_to_decorate.add(key)
                curies_to_names[key] = node.name
            if parameters['object_qnode_key'] in node.qnode_keys:
                object_curies_to_decorate.add(key)
                curies_to_names[key] = node.name
    added_flag = False  # check to see if any edges were added
    # call the COHD api one time (for all curies at once) to save time
    curies_to_decorate = subject_curies_to_decorate | object_curies_to_decorate
    self.mapping_curie_to_omop_ids = self.cohdIndex.get_concept_ids(curies_to_decorate)
    # iterate over all pairs of these nodes, add the virtual edge, decorate with the correct attribute
    for subject_curie, object_curie in itertools.product(subject_curies_to_decorate, object_curies_to_decorate):
        # create the edge attribute if it can be
        edge_attribute = self.make_edge_attribute_from_curies(
            subject_curie, object_curie,
            subject_name=curies_to_names[subject_curie],
            object_name=curies_to_names[object_curie],
            default=default, name=name)
        if not edge_attribute:
            continue
        added_flag = True
        relation = parameters['virtual_relation_label']
        defined_datetime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        # Pick a KG edge key that is not in use yet
        # (might need to change after expand is implemented for TRAPI 1.0)
        edge_key = f"{relation}_{self.global_iter}"
        while edge_key in self.message.knowledge_graph.edges:
            edge_key = f"{relation}_{self.global_iter}.{random.randint(10**(9-1), (10**9)-1)}"
        self.global_iter += 1
        edge_attribute_list = [
            edge_attribute,
            EdgeAttribute(original_attribute_name="virtual_relation_label", value=relation,
                          attribute_type_id="biolink:Unknown"),
            EdgeAttribute(original_attribute_name="defined_datetime", value=defined_datetime,
                          attribute_type_id="metatype:Datetime"),
            EdgeAttribute(original_attribute_name="provided_by", value=provided_by,
                          attribute_type_id="biolink:aggregator_knowledge_source",
                          attribute_source=provided_by, value_type_id="biolink:InformationResource"),
            EdgeAttribute(original_attribute_name=None, value=True,
                          attribute_type_id="biolink:computed_value",
                          attribute_source="infores:arax-reasoner-ara", value_type_id="metatype:Boolean",
                          value_url=None,
                          description="This edge is a container for a computed value between two nodes that is not directly attachable to other edges."),
        ]
        edge = Edge(predicate=edge_type, subject=subject_curie, object=object_curie,
                    attributes=edge_attribute_list)
        edge.qedge_keys = [relation]
        self.message.knowledge_graph.edges[edge_key] = edge
        # If results already exist, attach the new edge to them
        if self.message.results is not None and len(self.message.results) > 0:
            ou.update_results_with_overlay_edge(subject_knode_key=subject_curie, object_knode_key=object_curie,
                                                kedge_key=edge_key, message=self.message, log=self.response)
    # Now add a q_edge to the query_graph since extra edges were added to the KG
    if added_flag:
        relation = parameters['virtual_relation_label']
        subject_qnode_key = parameters['subject_qnode_key']
        object_qnode_key = parameters['object_qnode_key']
        option_group_id = ou.determine_virtual_qedge_option_group(
            subject_qnode_key, object_qnode_key, self.message.query_graph, self.response)
        # `predicates` is plural: pass a list rather than a bare string so it is not
        # treated as a sequence of characters
        q_edge = QEdge(predicates=[edge_type], subject=subject_qnode_key, object=object_qnode_key,
                       option_group_id=option_group_id)
        q_edge.relation = relation
        self.message.query_graph.edges[relation] = q_edge
def _remap_edge(edge: Edge, new_curie: str, old_curie: str) -> Edge: if edge.subject == new_curie: edge.subject = old_curie if edge.object == new_curie: edge.object = old_curie return edge