def _convert_kg2_edge_to_swagger_edge(self, neo4j_edge):
    """
    Translate one KG2 neo4j edge into a swagger Edge.

    :param neo4j_edge: mapping of KG2 edge properties (only read through ``.get``)
    :return: the populated Edge, marked as defined by "ARAX/KG2"
    """
    kg2_edge = Edge()
    kg2_edge.type = neo4j_edge.get('simplified_edge_label')
    kg2_edge.source_id = neo4j_edge.get('subject')
    kg2_edge.target_id = neo4j_edge.get('object')
    # The edge is handed to the ID helper only after type/source/target are populated
    kg2_edge.id = self._create_edge_id(kg2_edge)
    kg2_edge.relation = neo4j_edge.get('relation')

    # 'publications' and 'negated' are parsed into Python objects with ast.literal_eval
    raw_publications = neo4j_edge.get('publications')
    kg2_edge.publications = ast.literal_eval(raw_publications)
    raw_provided_by = neo4j_edge.get('provided_by')
    # Temporary hack until provided_by is fixed in KG2
    kg2_edge.provided_by = self._convert_strange_provided_by_field_to_list(raw_provided_by)
    raw_negated = neo4j_edge.get('negated')
    kg2_edge.negated = ast.literal_eval(raw_negated)
    kg2_edge.is_defined_by = "ARAX/KG2"

    # Add additional properties on KG2 edges as swagger EdgeAttribute objects
    # TODO: fix issues coming from strange characters in 'publications_info'! (EOF error)
    extra_property_names = ['relation_curie', 'simplified_relation_curie', 'simplified_relation', 'edge_label']
    kg2_edge.edge_attributes = []
    kg2_edge.edge_attributes += self._create_swagger_attributes("edge", extra_property_names, neo4j_edge)
    return kg2_edge
def _convert_kg1_edge_to_swagger_edge(self, neo4j_edge: Dict[str, any], node_uuid_to_curie_dict: Dict[str, str]) -> Edge:
    """
    Translate one KG1 neo4j edge into a swagger Edge.

    :param neo4j_edge: mapping of KG1 edge properties (only read through ``.get``)
    :param node_uuid_to_curie_dict: lookup from a node UUID to that node's curie; a UUID missing
        from it raises KeyError
    :return: the populated Edge, marked as defined by "ARAX/KG1"
    """
    kg1_edge = Edge()
    kg1_edge.type = neo4j_edge.get("predicate")

    # KG1 edges reference their end nodes by UUID; the swagger edge carries the curies instead
    source_uuid = neo4j_edge.get("source_node_uuid")
    kg1_edge.source_id = node_uuid_to_curie_dict[source_uuid]
    target_uuid = neo4j_edge.get("target_node_uuid")
    kg1_edge.target_id = node_uuid_to_curie_dict[target_uuid]

    kg1_edge.id = "KG1:{}".format(neo4j_edge.get('id'))
    kg1_edge.relation = neo4j_edge.get("relation")
    kg1_edge.provided_by = neo4j_edge.get("provided_by")
    kg1_edge.is_defined_by = "ARAX/KG1"

    # Only edges with a truthy 'probability' get edge attributes
    if not neo4j_edge.get("probability"):
        return kg1_edge
    kg1_edge.edge_attributes = self._create_swagger_attributes("edge", ["probability"], neo4j_edge)
    return kg1_edge
def _create_ngd_edge(self, ngd_value: float, source_id: str, target_id: str) -> Edge:
    """
    Build an ARAX-provided Edge carrying a single NGD EdgeAttribute.

    :param ngd_value: value stored (as given) on the edge attribute
    :param source_id: identifier placed in the edge's source_id
    :param target_id: identifier placed in the edge's target_id
    :return: the new Edge; its id has the form "NGD:<source>--<type>--<target>"
    """
    new_edge = Edge()
    new_edge.type = self.ngd_edge_type
    new_edge.source_id = source_id
    new_edge.target_id = target_id
    # The type is read back from the edge when composing the id
    new_edge.id = "NGD:{}--{}--{}".format(source_id, new_edge.type, target_id)
    new_edge.provided_by = "ARAX"
    new_edge.is_defined_by = "ARAX"

    ngd_attribute = EdgeAttribute(name=self.ngd_edge_attribute_name,
                                  type=self.ngd_edge_attribute_type,
                                  value=ngd_value,
                                  url=self.ngd_edge_attribute_url)
    new_edge.edge_attributes = [ngd_attribute]
    return new_edge
def _convert_kg1_edge_to_swagger_edge(self, neo4j_edge, node_uuid_to_curie_dict):
    """
    Translate one KG1 neo4j edge into a swagger Edge.

    :param neo4j_edge: mapping of KG1 edge properties (only read through ``.get``)
    :param node_uuid_to_curie_dict: lookup from a node UUID to that node's curie; a UUID missing
        from it raises KeyError
    :return: the populated Edge, marked as defined by "ARAX/KG1"
    """
    kg1_edge = Edge()
    kg1_edge.type = neo4j_edge.get('predicate')

    # KG1 edges reference their end nodes by UUID; the swagger edge carries the curies instead
    source_uuid = neo4j_edge.get('source_node_uuid')
    kg1_edge.source_id = node_uuid_to_curie_dict[source_uuid]
    target_uuid = neo4j_edge.get('target_node_uuid')
    kg1_edge.target_id = node_uuid_to_curie_dict[target_uuid]

    # The edge is handed to the ID helper only after type/source/target are populated
    kg1_edge.id = self._create_edge_id(kg1_edge)
    kg1_edge.relation = neo4j_edge.get('relation')
    kg1_edge.provided_by = neo4j_edge.get('provided_by')
    kg1_edge.is_defined_by = "ARAX/KG1"

    # Only edges with a truthy 'probability' get edge attributes
    if not neo4j_edge.get('probability'):
        return kg1_edge
    kg1_edge.edge_attributes = self._create_swagger_attributes("edge", ['probability'], neo4j_edge)
    return kg1_edge
def _convert_kg2_edge_to_swagger_edge(self, neo4j_edge: Dict[str, any]) -> Edge:
    """
    Translate one KG2 neo4j edge into a swagger Edge.

    :param neo4j_edge: mapping of KG2 edge properties (only read through ``.get``)
    :return: the populated Edge, with id "KG2:<neo4j id>" and marked as defined by "ARAX/KG2"
    """
    kg2_edge = Edge()
    kg2_edge.id = "KG2:{}".format(neo4j_edge.get('id'))
    kg2_edge.type = neo4j_edge.get("simplified_edge_label")
    kg2_edge.source_id = neo4j_edge.get("subject")
    kg2_edge.target_id = neo4j_edge.get("object")
    kg2_edge.relation = neo4j_edge.get("relation")

    # 'publications' and 'negated' are parsed into Python objects with ast.literal_eval
    raw_publications = neo4j_edge.get("publications")
    kg2_edge.publications = ast.literal_eval(raw_publications)
    raw_provided_by = neo4j_edge.get("provided_by")
    # Temporary hack until provided_by is fixed in KG2
    kg2_edge.provided_by = self._convert_strange_provided_by_field_to_list(raw_provided_by)
    raw_negated = neo4j_edge.get("negated")
    kg2_edge.negated = ast.literal_eval(raw_negated)
    kg2_edge.is_defined_by = "ARAX/KG2"

    # Add additional properties on KG2 edges as swagger EdgeAttribute objects
    # TODO: fix issues coming from strange characters in 'publications_info'! (EOF error)
    extra_property_names = ["relation_curie", "simplified_relation_curie", "simplified_relation", "edge_label"]
    kg2_edge.edge_attributes = []
    kg2_edge.edge_attributes += self._create_swagger_attributes("edge", extra_property_names, neo4j_edge)
    return kg2_edge
def predict_drug_treats_disease(self):
    """
    Iterate over all the edges in the knowledge graph, add the drug-disease treatment probability for
    appropriate edges on the edge_attributes.

    Two modes, selected by the parameters:

    * 'virtual_relation_label' present: for every (source, target) pair of KG nodes bound to
      'source_qnode_id' / 'target_qnode_id' (sources must be drug-like, targets disease-like), a virtual
      "probably_treats" edge carrying the probability is appended to the knowledge graph, and one matching
      QEdge is appended to the query graph if at least one edge was added.
    * otherwise: every existing KG edge that joins a drug-like node and a disease-like node (in either
      direction) gets a "probability_treats" edge attribute.

    The probability for a pair is the highest finite value the model returns over all pairs of equivalent
    (trained) curies. It is computed afresh for every pair. A probability of 0 (including "the model
    returned nothing finite") means nothing is added for that pair.

    :return: response
    """
    parameters = self.parameters
    self.response.debug(f"Computing drug disease treatment probability based on a machine learning model")
    self.response.info(f"Computing drug disease treatment probability based on a machine learning model: See [this publication](https://doi.org/10.1101/765305) for more details about how this is accomplished.")

    attribute_name = "probability_treats"
    attribute_type = "EDAM:data_0951"
    url = "https://doi.org/10.1101/765305"
    drug_types = ("drug", "chemical_substance")
    disease_types = ("disease", "phenotypic_feature")

    def has_any_type(node_types, wanted_types):
        """True if any of wanted_types is contained in node_types (this is now NOT checked by ARAX_overlay)."""
        return any(wanted in node_types for wanted in wanted_types)

    def best_probability(drug_curie, disease_curie):
        """
        Highest finite model probability over all pairs of equivalent curies.

        Returns None when either curie cannot be converted to trained curies, and 0 (the
        "don't include" default) when the model yields no usable finite probability.
        """
        trained_drug_curies = self.convert_to_trained_curies(drug_curie)
        trained_disease_curies = self.convert_to_trained_curies(disease_curie)
        if trained_drug_curies is None or trained_disease_curies is None:
            return None
        curie_pairs = list(itertools.product(trained_drug_curies, trained_disease_curies))
        if not curie_pairs:
            return 0
        all_probabilities = self.pred.prob_all(curie_pairs)
        if not isinstance(all_probabilities, list):
            return 0
        finite_probabilities = [prob for prob in all_probabilities if np.isfinite(prob)]
        # max() of an empty sequence raises ValueError, so fall back to the default explicitly
        return max(finite_probabilities) if finite_probabilities else 0

    # if you want to add virtual edges, identify the source/targets, decorate the edges, add them to the KG,
    # and then add one to the QG corresponding to them
    if 'virtual_relation_label' in parameters:
        source_curies_to_decorate = set()
        target_curies_to_decorate = set()
        # identify the nodes that we should be adding virtual edges for
        for node in self.message.knowledge_graph.nodes:
            if hasattr(node, 'qnode_ids'):
                if parameters['source_qnode_id'] in node.qnode_ids:
                    if has_any_type(node.type, drug_types):
                        source_curies_to_decorate.add(node.id)
                if parameters['target_qnode_id'] in node.qnode_ids:
                    if has_any_type(node.type, disease_types):
                        target_curies_to_decorate.add(node.id)

        edge_type = "probably_treats"
        relation = parameters['virtual_relation_label']
        added_flag = False  # check to see if any edges where added
        # iterate over all pairs of these nodes, add the virtual edge, decorate with the correct attribute
        for source_curie, target_curie in itertools.product(source_curies_to_decorate, target_curies_to_decorate):
            value = best_probability(source_curie, target_curie)
            if value is None or value == 0:
                continue  # unconvertible curie or default probability: don't include that edge
            edge_attribute = EdgeAttribute(type=attribute_type, name=attribute_name, value=str(value), url=url)
            added_flag = True
            # now actually add the virtual edge in
            edge_id = f"{relation}_{self.global_iter}"
            self.global_iter += 1
            edge = Edge(id=edge_id,
                        type=edge_type,
                        relation=relation,
                        source_id=source_curie,
                        target_id=target_curie,
                        is_defined_by="ARAX",
                        defined_datetime=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                        provided_by="ARAX",
                        confidence=None,
                        weight=None,  # TODO: could make the actual value of the attribute
                        edge_attributes=[edge_attribute],
                        qedge_ids=[relation])
            self.message.knowledge_graph.edges.append(edge)

        # Now add a q_edge the query_graph since I've added an extra edge to the KG
        if added_flag:
            q_edge = QEdge(id=relation, type=edge_type, relation=relation,
                           source_id=parameters['source_qnode_id'],
                           target_id=parameters['target_qnode_id'])  # TODO: ok to make the id and type the same thing?
            self.message.query_graph.edges.append(q_edge)
        return self.response

    # otherwise you want to add it for each edge in the KG
    try:
        # map curies to types
        curie_to_type = {node.id: node.type for node in self.message.knowledge_graph.nodes}
        # then iterate over the edges and decorate if appropriate
        for edge in self.message.knowledge_graph.edges:
            # Make sure the edge_attributes are not None
            if not edge.edge_attributes:
                edge.edge_attributes = []  # should be an array, but why not a list?
            source_curie = edge.source_id
            target_curie = edge.target_id
            source_types = curie_to_type[source_curie]
            target_types = curie_to_type[target_curie]
            # The model is always asked about (drug, disease), whichever way the edge points
            if has_any_type(source_types, drug_types) and has_any_type(target_types, disease_types):
                value = best_probability(source_curie, target_curie)
            elif has_any_type(target_types, drug_types) and has_any_type(source_types, disease_types):
                value = best_probability(target_curie, source_curie)
            else:
                continue
            if value is None or value == 0:
                continue
            edge_attribute = EdgeAttribute(type=attribute_type, name=attribute_name, value=str(value), url=url)  # populate the attribute
            edge.edge_attributes.append(edge_attribute)  # append it to the list of attributes
    except Exception:
        tb = traceback.format_exc()
        error_type, error, _ = sys.exc_info()
        self.response.error(tb, error_code=error_type.__name__)
        self.response.error(f"Something went wrong adding the drug disease treatment probability")
    else:
        self.response.info(f"Drug disease treatment probability successfully added to edges")

    return self.response
def compute_ngd(self):
    """
    Iterate over all the edges in the knowledge graph, compute the normalized google distance and stick that
    info on the edge_attributes.

    Two modes, selected by the parameters:

    * 'virtual_relation_label' present: for every (source, target) pair of KG nodes bound to
      'source_qnode_id' / 'target_qnode_id', a virtual "has_normalized_google_distance_with" edge carrying
      the NGD is appended to the knowledge graph, plus one matching QEdge if any edge was added.
    * otherwise: every existing KG edge gets a "normalized_google_distance" edge attribute.

    For each pair the local (fast) NGD lookup on canonicalized curies is tried first. If it returns None
    the eUtils-based computation is used instead.

    :default: The default value to set for NGD if it returns a nan (parameters['default_value']). It is
        applied independently to every pair whose NGD is not finite.
    :return: response
    """
    if self.response.status != 'OK':  # Catches any errors that may have been logged during initialization
        self._close_database()
        return self.response

    def ngd_for_pair(curie_map, source_curie, target_curie, source_name, target_name):
        """Return (ngd_value, used_slow_method) for one node pair."""
        canonical_source_curie = curie_map.get(source_curie, source_curie)
        canonical_target_curie = curie_map.get(target_curie, target_curie)
        ngd_value = self.calculate_ngd_fast(canonical_source_curie, canonical_target_curie)
        if ngd_value is not None:
            return ngd_value, False
        ngd_value = self.NGD.get_ngd_for_all([source_curie, target_curie], [source_name, target_name])
        self.response.debug(f"Had to use eUtils to compute NGD between {source_name} "
                            f"({canonical_source_curie}) and {target_name} ({canonical_target_curie}). "
                            f"Value is: {ngd_value}")
        return ngd_value, True

    def log_fast_ngd_usage(num_computed_total, num_computed_slow):
        """Report the share of pairs served by fastNGD; nothing to report (or divide by) for zero pairs."""
        if not num_computed_total:
            return
        num_computed_fast = num_computed_total - num_computed_slow
        percent_computed_fast = round((num_computed_fast / num_computed_total) * 100)
        self.response.debug(f"Used fastNGD for {percent_computed_fast}% of edges "
                            f"({num_computed_fast} of {num_computed_total})")

    # try/finally so the database is closed even when something below raises
    try:
        parameters = self.parameters
        self.response.debug(f"Computing NGD")
        self.response.info(f"Computing the normalized Google distance: weighting edges based on source/target node "
                           f"co-occurrence frequency in PubMed abstracts")

        self.response.info("Converting CURIE identifiers to human readable names")
        node_curie_to_name = dict()
        try:
            for node in self.message.knowledge_graph.nodes:
                node_curie_to_name[node.id] = node.name
        except Exception:
            tb = traceback.format_exc()
            error_type, error, _ = sys.exc_info()
            self.response.error(f"Something went wrong when converting names")
            self.response.error(tb, error_code=error_type.__name__)

        attribute_name = "normalized_google_distance"
        attribute_type = "EDAM:data_2526"
        default_value = self.parameters['default_value']
        url = "https://arax.rtx.ai/api/rtx/v1/ui/#/PubmedMeshNgd"

        # if you want to add virtual edges, identify the source/targets, decorate the edges, add them to the KG,
        # and then add one to the QG corresponding to them
        if 'virtual_relation_label' in parameters:
            source_curies_to_decorate = set()
            target_curies_to_decorate = set()
            curies_to_names = dict()
            # identify the nodes that we should be adding virtual edges for
            for node in self.message.knowledge_graph.nodes:
                if hasattr(node, 'qnode_ids'):
                    if parameters['source_qnode_id'] in node.qnode_ids:
                        source_curies_to_decorate.add(node.id)
                        curies_to_names[node.id] = node.name
                    if parameters['target_qnode_id'] in node.qnode_ids:
                        target_curies_to_decorate.add(node.id)
                        curies_to_names[node.id] = node.name

            # Convert these curies to their canonicalized curies (needed for the local NGD system)
            canonicalized_curie_map = self._get_canonical_curies_map(
                list(source_curies_to_decorate.union(target_curies_to_decorate)))
            self.load_curie_to_pmids_data(canonicalized_curie_map.values())
            added_flag = False  # check to see if any edges where added
            num_computed_total = 0
            num_computed_slow = 0
            edge_type = "has_normalized_google_distance_with"
            relation = parameters['virtual_relation_label']

            self.response.debug(f"Looping through node pairs and calculating NGD values")
            # iterate over all pairs of these nodes, add the virtual edge, decorate with the correct attribute
            for source_curie, target_curie in itertools.product(source_curies_to_decorate, target_curies_to_decorate):
                source_name = curies_to_names[source_curie]
                target_name = curies_to_names[target_curie]
                num_computed_total += 1
                ngd_value, used_slow = ngd_for_pair(canonicalized_curie_map, source_curie, target_curie,
                                                    source_name, target_name)
                if used_slow:
                    num_computed_slow += 1
                # if ngd is finite, that's ok, otherwise use the default (decided per pair, never carried over)
                value = ngd_value if np.isfinite(ngd_value) else default_value
                edge_attribute = EdgeAttribute(type=attribute_type, name=attribute_name, value=str(value), url=url)  # populate the NGD edge attribute
                added_flag = True
                # now actually add the virtual edge in
                edge_id = f"{relation}_{self.global_iter}"
                self.global_iter += 1
                edge = Edge(id=edge_id,
                            type=edge_type,
                            relation=relation,
                            source_id=source_curie,
                            target_id=target_curie,
                            is_defined_by="ARAX",
                            defined_datetime=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                            provided_by="ARAX",
                            confidence=None,
                            weight=None,  # TODO: could make the actual value of the attribute
                            edge_attributes=[edge_attribute],
                            qedge_ids=[relation])
                self.message.knowledge_graph.edges.append(edge)

            # Now add a q_edge the query_graph since I've added an extra edge to the KG
            if added_flag:
                q_edge = QEdge(id=relation, type=edge_type, relation=relation,
                               source_id=parameters['source_qnode_id'],
                               target_id=parameters['target_qnode_id'])
                self.message.query_graph.edges.append(q_edge)

            self.response.info(f"NGD values successfully added to edges")
            log_fast_ngd_usage(num_computed_total, num_computed_slow)

        else:  # you want to add it for each edge in the KG
            num_computed_total = 0
            num_computed_slow = 0
            try:
                # Map all nodes to their canonicalized curies in one batch (need canonical IDs for the local NGD system)
                canonicalized_curie_map = self._get_canonical_curies_map(
                    [node.id for node in self.message.knowledge_graph.nodes])
                self.load_curie_to_pmids_data(canonicalized_curie_map.values())
                self.response.debug(f"Looping through edges and calculating NGD values")
                for edge in self.message.knowledge_graph.edges:
                    # Make sure the edge_attributes are not None
                    if not edge.edge_attributes:
                        edge.edge_attributes = []  # should be an array, but why not a list?
                    # now go and actually get the NGD
                    source_curie = edge.source_id
                    target_curie = edge.target_id
                    source_name = node_curie_to_name[source_curie]
                    target_name = node_curie_to_name[target_curie]
                    num_computed_total += 1
                    ngd_value, used_slow = ngd_for_pair(canonicalized_curie_map, source_curie, target_curie,
                                                        source_name, target_name)
                    if used_slow:
                        num_computed_slow += 1
                    # if ngd is finite, that's ok, otherwise use the default (decided per edge, never carried over)
                    value = ngd_value if np.isfinite(ngd_value) else default_value
                    ngd_edge_attribute = EdgeAttribute(type=attribute_type, name=attribute_name, value=str(value), url=url)  # populate the NGD edge attribute
                    edge.edge_attributes.append(ngd_edge_attribute)  # append it to the list of attributes
            except Exception:
                tb = traceback.format_exc()
                error_type, error, _ = sys.exc_info()
                self.response.error(tb, error_code=error_type.__name__)
                self.response.error(f"Something went wrong adding the NGD edge attributes")
            else:
                self.response.info(f"NGD values successfully added to edges")
                log_fast_ngd_usage(num_computed_total, num_computed_slow)
    finally:
        self._close_database()

    return self.response
def compute_ngd(self):
    """
    Iterate over all the edges in the knowledge graph, compute the normalized google distance and stick that
    info on the edge_attributes.

    Two modes, selected by the parameters:

    * 'virtual_relation_label' present: for every (source, target) pair of KG nodes bound to
      'source_qnode_id' / 'target_qnode_id', a virtual "has_normalized_google_distance_with" edge carrying
      the NGD is appended to the knowledge graph, plus one matching QEdge if any edge was added.
    * otherwise: every existing KG edge gets a "normalized_google_distance" edge attribute.

    The NGD and the method that produced it ("fast" or "slow") both come from
    self.NGD.get_ngd_for_all_fast; the method is only used for the usage counts logged at the end.

    :default: The default value to set for NGD if it returns a nan (parameters['default_value']). It is
        applied independently to every pair whose NGD is not finite.
    :return: response
    """
    parameters = self.parameters
    self.response.debug(f"Computing NGD")
    self.response.info(f"Computing the normalized Google distance: weighting edges based on source/target node "
                       f"co-occurrence frequency in PubMed abstracts")

    self.response.info("Converting CURIE identifiers to human readable names")
    node_curie_to_name = dict()
    try:
        for node in self.message.knowledge_graph.nodes:
            node_curie_to_name[node.id] = node.name
    except Exception:
        tb = traceback.format_exc()
        error_type, error, _ = sys.exc_info()
        self.response.error(f"Something went wrong when converting names")
        self.response.error(tb, error_code=error_type.__name__)

    self.response.warning(f"Utilizing API calls to NCBI eUtils, so this may take a while...")
    attribute_name = "normalized_google_distance"
    attribute_type = "data:2526"
    default_value = self.parameters['default_value']
    url = "https://arax.rtx.ai/api/rtx/v1/ui/#/PubmedMeshNgd"
    ngd_method_counts = {"fast": 0, "slow": 0}

    # if you want to add virtual edges, identify the source/targets, decorate the edges, add them to the KG,
    # and then add one to the QG corresponding to them
    if 'virtual_relation_label' in parameters:
        source_curies_to_decorate = set()
        target_curies_to_decorate = set()
        curies_to_names = dict()
        # identify the nodes that we should be adding virtual edges for
        for node in self.message.knowledge_graph.nodes:
            if hasattr(node, 'qnode_ids'):
                if parameters['source_qnode_id'] in node.qnode_ids:
                    source_curies_to_decorate.add(node.id)
                    curies_to_names[node.id] = node.name
                if parameters['target_qnode_id'] in node.qnode_ids:
                    target_curies_to_decorate.add(node.id)
                    curies_to_names[node.id] = node.name

        edge_type = "has_normalized_google_distance_with"
        relation = parameters['virtual_relation_label']
        added_flag = False  # check to see if any edges where added
        # iterate over all pairs of these nodes, add the virtual edge, decorate with the correct attribute
        for source_curie, target_curie in itertools.product(source_curies_to_decorate, target_curies_to_decorate):
            source_name = curies_to_names[source_curie]
            target_name = curies_to_names[target_curie]
            self.response.debug(f"Computing NGD between {source_name} and {target_name}")
            ngd_value, method_used = self.NGD.get_ngd_for_all_fast([source_curie, target_curie],
                                                                   [source_name, target_name])
            ngd_method_counts[method_used] += 1
            # if ngd is finite, that's ok, otherwise use the default (decided per pair, never carried over)
            value = ngd_value if np.isfinite(ngd_value) else default_value
            edge_attribute = EdgeAttribute(type=attribute_type, name=attribute_name, value=str(value), url=url)  # populate the NGD edge attribute
            added_flag = True
            # now actually add the virtual edge in
            edge_id = f"{relation}_{self.global_iter}"
            self.global_iter += 1
            edge = Edge(id=edge_id,
                        type=edge_type,
                        relation=relation,
                        source_id=source_curie,
                        target_id=target_curie,
                        is_defined_by="ARAX",
                        defined_datetime=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                        provided_by="ARAX",
                        confidence=None,
                        weight=None,  # TODO: could make the actual value of the attribute
                        edge_attributes=[edge_attribute],
                        qedge_ids=[relation])
            self.message.knowledge_graph.edges.append(edge)

        # Now add a q_edge the query_graph since I've added an extra edge to the KG
        if added_flag:
            q_edge = QEdge(id=relation, type=edge_type, relation=relation,
                           source_id=parameters['source_qnode_id'],
                           target_id=parameters['target_qnode_id'])
            self.message.query_graph.edges.append(q_edge)

    else:  # you want to add it for each edge in the KG
        try:
            for edge in self.message.knowledge_graph.edges:
                # Make sure the edge_attributes are not None
                if not edge.edge_attributes:
                    edge.edge_attributes = []  # should be an array, but why not a list?
                # now go and actually get the NGD
                source_curie = edge.source_id
                target_curie = edge.target_id
                source_name = node_curie_to_name[source_curie]
                target_name = node_curie_to_name[target_curie]
                ngd_value, method_used = self.NGD.get_ngd_for_all_fast([source_curie, target_curie],
                                                                       [source_name, target_name])
                ngd_method_counts[method_used] += 1
                # if ngd is finite, that's ok, otherwise use the default (decided per edge, never carried over)
                value = ngd_value if np.isfinite(ngd_value) else default_value
                ngd_edge_attribute = EdgeAttribute(type=attribute_type, name=attribute_name, value=str(value), url=url)  # populate the NGD edge attribute
                edge.edge_attributes.append(ngd_edge_attribute)  # append it to the list of attributes
        except Exception:
            tb = traceback.format_exc()
            error_type, error, _ = sys.exc_info()
            self.response.error(tb, error_code=error_type.__name__)
            self.response.error(f"Something went wrong adding the NGD edge attributes")
        else:
            self.response.info(f"NGD values successfully added to edges")

    self.response.debug(f"Used fast NGD for {ngd_method_counts['fast']} edges, back-up NGD method for {ngd_method_counts['slow']}")
    return self.response