def answer(source_list, source_type, target_type, use_json=False, num_show=20, rel_type=None):
    """
    Answers the question 'what pathways are most enriched by $protein_list?'
    :param source_list: A list of source node ids
    :param source_type: The source node label
    :param target_type: The target node label
    :param use_json: bool, use JSON output
    :param num_show: int, number to display
    :param rel_type: only consider edges of this relationship type (None means any type)
    :return: None on error; otherwise a tuple (target_dict, target_list) of Fisher exact p-values and the ranked target IDs
    """
    if RU.does_connect(source_list, source_type, target_type) != 1:
        error_message = "I found no %s connected to any element of %s" % (target_type, str(source_list))
        if not use_json:
            print(error_message)
            return
        else:
            error_code = "NoPathsFound"
            response = FormatOutput.FormatResponse(3)
            response.add_error_message(error_code, error_message)
            response.print()
            return
    (target_dict, target_list) = RU.top_n_fisher_exact(source_list, source_type, target_type, n=num_show, rel_type=rel_type)
    target_list.reverse()
    return (target_dict, target_list)
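# The helper below is a minimal, self-contained sketch of the enrichment idea behind
# RU.top_n_fisher_exact, NOT the repository implementation: each candidate target gets a 2x2
# contingency table (source nodes connected vs. not connected, against a background set), is
# scored with Fisher's exact test, and the n smallest p-values are kept. The counts in the
# usage comment are made-up illustration data.
from scipy.stats import fisher_exact


def toy_top_n_fisher_exact(tables, n=20):
    """tables: dict mapping a target id to a 2x2 table
    [[sources connected, sources not connected],
     [background connected, background not connected]]"""
    p_values = {}
    for target_id, table in tables.items():
        _, p_value = fisher_exact(table)
        p_values[target_id] = p_value
    ranked = sorted(p_values, key=p_values.get)[:n]  # smallest p-value = most enriched
    return p_values, ranked


# Example (hypothetical counts): pathway PW:1 is hit by 8 of 10 source proteins but only
# 30 of 1000 background proteins, so it ranks first:
# toy_top_n_fisher_exact({"PW:1": [[8, 2], [30, 970]], "PW:2": [[1, 9], [100, 900]]}, n=2)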
def other_connection_types(): # Besides direct disease->phenotype connections, here is a list of other possible connections # one is parent of print("one") node_label_list = [disease_label, "phenotypic_feature"] relationship_label_list = ["subclass_of", "has_phenotype", "has_phenotype"] node_of_interest_position = 1 print( RU.count_nodes_of_type_on_path_of_type_to_label( disease_ID, disease_label, target_label, node_label_list, relationship_label_list, node_of_interest_position, debug=True)) names2counts, names2nodes = RU.count_nodes_of_type_on_path_of_type_to_label( disease_ID, disease_label, target_label, node_label_list, relationship_label_list, node_of_interest_position) for ID in names2counts.keys(): if names2counts[ID] / float( len(disease_phenotypes_set) ) >= threshold: # if it's below this threshold, no way the Jaccard index will be large enough other_disease_IDs_to_intersection_counts[ID] = names2counts[ID] # other is parent of print("other") node_label_list = ["phenotypic_feature", target_label] relationship_label_list = ["has_phenotype", "has_phenotype", "subclass_of"] node_of_interest_position = 0 names2counts, names2nodes = RU.count_nodes_of_type_on_path_of_type_to_label( disease_ID, disease_label, target_label, node_label_list, relationship_label_list, node_of_interest_position) for ID in names2counts.keys(): if names2counts[ID] / float( len(disease_phenotypes_set) ) >= threshold: # if it's below this threshold, no way the Jaccard index will be large enough other_disease_IDs_to_intersection_counts[ID] = names2counts[ID] # Both is parent of print("both") node_label_list = [disease_label, "phenotypic_feature", target_label] relationship_label_list = [ "subclass_of", "has_phenotype", "has_phenotype", "subclass_of" ] node_of_interest_position = 1 names2counts, names2nodes = RU.count_nodes_of_type_on_path_of_type_to_label( disease_ID, disease_label, target_label, node_label_list, relationship_label_list, node_of_interest_position) for ID in names2counts.keys(): if names2counts[ID] / float( len(disease_phenotypes_set) ) >= threshold: # if it's below this threshold, no way the Jaccard index will be large enough other_disease_IDs_to_intersection_counts[ID] = names2counts[ID]
def restate_question(self, input_parameters):
    """
    Restates a question.
    :param input_parameters: a dictionary with keys given by self.parameters.keys()
    :return: string
    """
    # First, get rid of the Nones since they substitute in an ugly way
    parameters = dict()
    for key, value in input_parameters.items():
        if value is not None:
            parameters[key] = value
    # Try to get the description of each node
    parameters_as_descriptions = dict()
    if parameters:
        for parameter in parameters:
            try:
                description = RU.get_node_property(parameters[parameter], 'description')
            except:
                description = parameters[parameter]
            parameters_as_descriptions[parameter] = description
    # Lastly, make the template substitution
    if parameters_as_descriptions:
        restated = self.restated_question_template.safe_substitute(parameters_as_descriptions)
    else:
        restated = self.restated_question_template.safe_substitute({})
    return restated
def describe(self): output = "Answers questions of the form: 'What proteins does tranilast target?' and 'What genes are affected by " \ "Fanconi anemia?'" + "\n" output += "You can ask: 'What X does Y Z?' where X is one of the following: \n" for label in RU.get_node_labels(): output = output + label + "\n" output += "\n The term Y is any of the nodes that are in our graph (currently " + str( RU.count_nodes()) + " nodes in total). \n" output += "\n The term Z is any relationship of the following kind: \n" for rel in RU.get_relationship_types(): rel_split = rel.split("_") for term in rel_split: output += term + " " output += "\n" output += "Assumes that Z directly connects X and Y." return output
def test_correct_question(): """ Point of this test is to form a bunch of sentences, match them against all queries, and make sure the correct question template is matched :return: None """ # get a random selection of nodes property_to_nodes = dict() for label in RU.get_node_labels(): nodes = RU.get_random_nodes(label, property="description") property_to_nodes[label] = nodes # import the questions questions = [] with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'Questions.tsv'), 'r') as fid: for line in fid.readlines(): if line[0] == "#": pass else: questions.append(Question(line)) # form the corpora corpora = [q.corpus for q in questions] for q in questions: # populate the sentence template parameters = dict() # ignore the what is question if q.parameter_names and q.parameter_names[0] != "term": for label in q.parameter_names: node = random.choice(property_to_nodes[label]) parameters[label] = node input_sentence = q.restate_question(parameters) input_sentence = input_sentence.strip(string.punctuation) # Run it against all the questions (corpus_index, similarity) = wd.find_corpus(input_sentence, corpora) if questions[corpus_index].restated_question_template.template != q.restated_question_template.template: temp_parameters = questions[corpus_index].get_parameters(input_sentence) # test if the parameters were populated if all([val is not None for val in temp_parameters.values()]): print("Bad classification! input: %s\n matched template: %s" % (input_sentence, questions[corpus_index].restated_question_template.template)) print(questions[corpus_index].get_parameters(input_sentence))
def get_similar_nodes_in_common_parameters(node_ID, target_node_label, association_node_label):
    """
    This function will get the parameters for get_similar_nodes_in_common based on target node, target label, and association label
    :param node_ID: source node ID (name in KG)
    :param target_node_label: the node types that you want returned
    :param association_node_label: the association node (node in common between source and target) type
    :return: dict, error_code, error_message (dict keys input_node_ID, input_node_label, association_node_label,
    input_association_relationship, target_association_relationship, target_node_label)
    """
    # Check if node exists
    if not RU.node_exists_with_property(node_ID, 'id'):
        error_message = "Sorry, the disease %s is not yet in our knowledge graph." % node_ID
        error_code = "DiseaseNotFound"
        return dict(), error_code, error_message

    # Get label/kind of node the source is
    input_node_label = RU.get_node_property(node_ID, "label")
    input_node_ID = node_ID

    # Get relationship between source and association label
    rels = RU.get_relationship_types_between(input_node_ID, input_node_label, "", association_node_label, max_path_len=1)
    # TODO: there could be multiple relationship types, for now, let's just pop one
    if not rels:
        error_code = "NoRelationship"
        error_message = "Sorry, the %s %s is not connected to any %s." % (input_node_label, input_node_ID, association_node_label)
        parent = RU.get_one_hop_target(input_node_label, input_node_ID, input_node_label, "subclass_of", direction="r")
        if parent:
            parent = parent.pop()
            error_message += "\n Note that %s is a parent of %s, so you might try that instead." % (
                RU.get_node_property(parent, 'name'), RU.get_node_property(input_node_ID, 'name'))
        return dict(), error_code, error_message
    input_association_relationship = rels.pop()

    # Get relationship between target and association label
    rels = RU.get_relationship_types_between("", target_node_label, "", association_node_label, max_path_len=1)
    if not rels:
        error_code = "NoRelationship"
        error_message = "Sorry, no %s is connected to any %s." % (target_node_label, association_node_label)
        return dict(), error_code, error_message
    target_association_relationship = rels.pop()

    # TODO: kludgy fix for microRNA's having multiple relationship types, only one of which shows up frequently
    if target_association_relationship == "gene_mutations_contribute_to":
        target_association_relationship = "gene_associated_with_condition"

    # populate the arguments
    arguments = dict(input_node_ID=input_node_ID,
                     input_node_label=input_node_label,
                     association_node_label=association_node_label,
                     input_association_relationship=input_association_relationship,
                     target_association_relationship=target_association_relationship,
                     target_node_label=target_node_label)
    return arguments, None, None
def answer(source_node_ID, target_node_type, association_node_type, use_json=False, threshold=0.2, n=20): """ Answers the question what X are similar to Y based on overlap of common Z nodes. X is target_node_type, Y is source_node_ID, Z is association_node_type. The relationships are automatically determined in SimilarNodesInCommon by looking for 1 hop relationships and poping the FIRST one (you are warned). :param source_node_ID: actual name in the KG :param target_node_type: kinds of nodes you want returned :param association_node_type: kind of node you are computing the Jaccard overlap on :param use_json: print the results in standardized format :param threshold: only return results where jaccard is >= this threshold :param n: number of results to return (default 20) :return: reponse (or printed text) """ # Initialize the response class response = FormatOutput.FormatResponse(5) # add the column names for the row data response.message.table_column_names = [ "source name", "source ID", "target name", "target ID", "Jaccard index" ] # Initialize the similar nodes class similar_nodes_in_common = SimilarNodesInCommon.SimilarNodesInCommon() # get the description source_node_description = RU.get_node_property(source_node_ID, 'name') # get the source node label source_node_label = RU.get_node_property(source_node_ID, 'label') # Get the nodes in common node_jaccard_tuples_sorted, error_code, error_message = similar_nodes_in_common.get_similar_nodes_in_common_source_target_association( source_node_ID, target_node_type, association_node_type, threshold) # reduce to top 100 if len(node_jaccard_tuples_sorted) > n: node_jaccard_tuples_sorted = node_jaccard_tuples_sorted[0:n] # make sure that the input node isn't in the list node_jaccard_tuples_sorted = [ i for i in node_jaccard_tuples_sorted if i[0] != source_node_ID ] # check for an error if error_code is not None or error_message is not None: if not use_json: print(error_message) return else: response.add_error_message(error_code, error_message) response.print() return #### If use_json not specified, then return results as a fairly plain list if not use_json: to_print = "The %s's involving similar %ss as %s are: \n" % ( target_node_type, association_node_type, source_node_description) for other_disease_ID, jaccard in node_jaccard_tuples_sorted: to_print += "%s\t%s\tJaccard %f\n" % ( other_disease_ID, RU.get_node_property(other_disease_ID, 'name'), jaccard) print(to_print) #### Else if use_json requested, return the results in the Translator standard API JSON format else: #### Create the QueryGraph for this type of question query_graph = QueryGraph() source_node = QNode() source_node.id = "n00" source_node.curie = source_node_ID source_node.type = source_node_label association_node = QNode() association_node.id = "n01" association_node.type = association_node_type association_node.is_set = True target_node = QNode() target_node.id = "n02" target_node.type = target_node_type query_graph.nodes = [source_node, association_node, target_node] #source_association_relationship_type = "unknown1" edge1 = QEdge() edge1.id = "en00-n01" edge1.source_id = "n00" edge1.target_id = "n01" #edge1.type = source_association_relationship_type #association_target_relationship_type = "unknown2" edge2 = QEdge() edge2.id = "en01-n02" edge2.source_id = "n01" edge2.target_id = "n02" #edge2.type = association_target_relationship_type query_graph.edges = [edge1, edge2] #### DONT Suppress the query_graph because we can now do the knowledge_map with v0.9.1 
response.message.query_graph = query_graph #### Create a mapping dict with the source curie and node types and edge types. This dict is used for reverse lookups by type #### for mapping to the QueryGraph. There is a potential point of failure here if there are duplicate node or edge types. FIXME response._type_map = dict() response._type_map[source_node.curie] = source_node.id response._type_map[association_node.type] = association_node.id response._type_map[target_node.type] = target_node.id response._type_map["e" + edge1.source_id + "-" + edge1.target_id] = edge1.id response._type_map["e" + edge2.source_id + "-" + edge2.target_id] = edge2.id #### Extract the sorted IDs from the list of tuples node_jaccard_ID_sorted = [ id for id, jac in node_jaccard_tuples_sorted ] # print(RU.return_subgraph_through_node_labels(source_node_ID, source_node_label, node_jaccard_ID_sorted, target_node_type, # [association_node_type], with_rel=[], directed=True, debug=True)) # get the entire subgraph g = RU.return_subgraph_through_node_labels(source_node_ID, source_node_label, node_jaccard_ID_sorted, target_node_type, [association_node_type], with_rel=[], directed=False, debug=False) # extract the source_node_number for node, data in g.nodes(data=True): if data['properties']['id'] == source_node_ID: source_node_number = node break # Get all the target numbers target_id2numbers = dict() node_jaccard_ID_sorted_set = set(node_jaccard_ID_sorted) for node, data in g.nodes(data=True): if data['properties']['id'] in node_jaccard_ID_sorted_set: target_id2numbers[data['properties']['id']] = node for other_disease_ID, jaccard in node_jaccard_tuples_sorted: target_name = RU.get_node_property(other_disease_ID, 'name') to_print = "The %s %s involves similar %ss as %s with similarity value %f" % ( target_node_type, target_name, association_node_type, source_node_description, jaccard) # get all the shortest paths between source and target all_paths = nx.all_shortest_paths( g, source_node_number, target_id2numbers[other_disease_ID]) # get all the nodes on these paths #try: if 1 == 1: rel_nodes = set() for path in all_paths: for node in path: rel_nodes.add(node) if rel_nodes: # extract the relevant subgraph sub_g = nx.subgraph(g, rel_nodes) # add it to the response res = response.add_subgraph(sub_g.nodes(data=True), sub_g.edges(data=True), to_print, jaccard, return_result=True) res.essence = "%s" % target_name # populate with essence of question result res.essence_type = target_node_type row_data = [] # initialize the row data row_data.append("%s" % source_node_description) row_data.append("%s" % source_node_ID) row_data.append("%s" % target_name) row_data.append("%s" % other_disease_ID) row_data.append("%f" % jaccard) res.row_data = row_data # except: # pass response.print()
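# A small self-contained networkx sketch (not the repository code) of the subgraph extraction used
# above: collect every node that lies on any shortest path between the source and one target, then
# keep only those nodes. The toy graph and node names below are illustrative.
import networkx as nx


def union_of_shortest_paths_subgraph(g, source, target):
    rel_nodes = set()
    for path in nx.all_shortest_paths(g, source, target):
        rel_nodes.update(path)  # keep every node on some shortest path
    return nx.subgraph(g, rel_nodes)


# toy_g = nx.Graph([("DOID:1", "HP:1"), ("HP:1", "DOID:2"), ("DOID:1", "HP:2"), ("HP:2", "DOID:2")])
# union_of_shortest_paths_subgraph(toy_g, "DOID:1", "DOID:2").nodes()  # all four nodes survive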
def old_answer(disease_ID, use_json=False, threshold=0.2): # This is about 5 times slower than the current answer, but is a bit clearer in how it's coded # Initialize the response class response = FormatOutput.FormatResponse(4) # Check if node exists if not RU.node_exists_with_property(disease_ID, 'name'): error_message = "Sorry, the disease %s is not yet in our knowledge graph." % disease_ID error_code = "DiseaseNotFound" if not use_json: print(error_message) return 1 else: response.add_error_message(error_code, error_message) response.print() return 1 # Get label/kind of node the source is disease_label = RU.get_node_property(disease_ID, "label") if disease_label != "disease" and disease_label != "disease": error_message = "Sorry, the input has label %s and needs to be one of: disease, disease." \ " Please try a different term" % disease_label error_code = "NotADisease" if not use_json: print(error_message) return 1 else: response.add_error_message(error_code, error_message) response.print() return 1 # get the description disease_description = RU.get_node_property(disease_ID, 'description') # get the phenotypes associated to the disease disease_phenotypes = RU.get_one_hop_target(disease_label, disease_ID, "phenotypic_feature", "has_phenotype") # Look more steps beyond if we didn't get any physically_interacts_with if disease_phenotypes == []: for max_path_len in range(2, 5): disease_phenotypes = RU.get_node_names_of_type_connected_to_target( disease_label, disease_ID, "phenotypic_feature", max_path_len=max_path_len, direction="u") if disease_phenotypes: break # print("Total of %d phenotypes" % len(disease_phenotypes)) # Make sure you actually picked up at least one phenotype if not disease_phenotypes: error_message = "No phenotypes found for this disease." error_code = "NoPhenotypesFound" if not use_json: print(error_message) return 1 else: response.add_error_message(error_code, error_message) response.print() return 1 disease_phenotypes_set = set(disease_phenotypes) # get all the other disease that connect and get the phenotypes in common other_disease_IDs_to_intersection_counts = dict() for target_label in ["disease", "disease"]: # direct connection # print("direct") node_label_list = ["phenotypic_feature"] relationship_label_list = ["has_phenotype", "has_phenotype"] node_of_interest_position = 0 names2counts, names2nodes = RU.count_nodes_of_type_on_path_of_type_to_label( disease_ID, disease_label, target_label, node_label_list, relationship_label_list, node_of_interest_position) for ID in names2counts.keys(): if names2counts[ID] / float( len(disease_phenotypes_set) ) >= threshold: # if it's below this threshold, no way the Jaccard index will be large enough other_disease_IDs_to_intersection_counts[ID] = names2counts[ID] if not other_disease_IDs_to_intersection_counts: error_code = "NoDiseasesFound" error_message = "No diseases were found with similarity crossing the threshold of %f." % threshold parent = RU.get_one_hop_target(disease_label, disease_ID, disease_label, "subclass_of").pop() if parent: error_message += "\n Note that %s is a parent disease to %s, so you might try that instead." 
            # fill in the parent-disease hint begun on the previous line
            error_message = error_message % (RU.get_node_property(parent, 'description'), disease_description)
        if not use_json:
            print(error_message)
            return 1
        else:
            response.add_error_message(error_code, error_message)
            response.print()
            return 1

    # print("Total number of other diseases %d" % len(list(other_disease_IDs_to_intersection_counts.keys())))
    # Now for each of the diseases in here, compute the actual Jaccard index
    disease_jaccard_tuples = []
    # i = 0
    for other_disease_ID in other_disease_IDs_to_intersection_counts.keys():
        # print(i)
        # i += 1
        # print(other_disease_ID)
        # get the phenotypes associated to the disease
        if other_disease_ID.split(":")[0] == "DOID":
            other_disease_label = "disease"
        if other_disease_ID.split(":")[0] == "OMIM":
            other_disease_label = "disease"
        other_disease_phenotypes = RU.get_one_hop_target(other_disease_label, other_disease_ID, "phenotypic_feature", "has_phenotype")
        # Look more steps beyond if we didn't get any physically_interacts_with
        if other_disease_phenotypes == []:
            for max_path_len in range(2, 5):
                other_disease_phenotypes = RU.get_node_names_of_type_connected_to_target(
                    other_disease_label, other_disease_ID, "phenotypic_feature", max_path_len=max_path_len, direction="u")
                if other_disease_phenotypes:
                    break
        # compute the Jaccard index
        if not other_disease_phenotypes:
            jaccard = 0
        else:
            other_disease_phenotypes_set = set(other_disease_phenotypes)
            jaccard = other_disease_IDs_to_intersection_counts[other_disease_ID] / float(
                len(list(disease_phenotypes_set.union(other_disease_phenotypes_set))))
        # print("jaccard %f" % jaccard)
        if jaccard > threshold:
            disease_jaccard_tuples.append((other_disease_ID, jaccard))

    # Format the results.
    # Maybe nothing passed the threshold
    if not disease_jaccard_tuples:
        error_code = "NoDiseasesFound"
        error_message = "No diseases were found with similarity crossing the threshold of %f." % threshold
        parent = RU.get_one_hop_target(disease_label, disease_ID, disease_label, "subclass_of")
        if parent:
            parent = parent.pop()  # get_one_hop_target returns a list, so take a single parent
            error_message += "\n Note that %s is a parent disease to %s, so you might try that instead." % (
                RU.get_node_property(parent, 'description'), disease_description)
        if not use_json:
            print(error_message)
            return 1
        else:
            response.add_error_message(error_code, error_message)
            return 1

    # Otherwise there are results to return, first sort them largest to smallest
    disease_jaccard_tuples_sorted = [(x, y) for x, y in sorted(disease_jaccard_tuples, key=lambda pair: pair[1], reverse=True)]
    if not use_json:
        to_print = "The diseases similar to %s are: \n" % disease_description
        for other_disease_ID, jaccard in disease_jaccard_tuples_sorted:
            to_print += "%s\t%s\tJaccard %f\n" % (other_disease_ID, RU.get_node_property(other_disease_ID, 'description'), jaccard)
        print(to_print)
    else:
        for other_disease_ID, jaccard in disease_jaccard_tuples_sorted:
            to_print = "%s is similar to the disease %s with similarity value %f" % (
                disease_description, RU.get_node_property(other_disease_ID, 'description'), jaccard)
            g = RU.get_node_as_graph(other_disease_ID)
            response.add_subgraph(g.nodes(data=True), g.edges(data=True), to_print, jaccard)
        response.print()
def get_similar_nodes_in_common(input_node_ID, input_node_label, association_node_label, input_association_relationship, target_association_relationship, target_node_label, threshold=0.2): """ This function returns the nodes that are associated with an input node based on Jaccard index similarity of shared intermediate nodes :param input_node_ID: input node ID (in KG) :param input_node_label: label of the input node :param association_node_label: what kind of node you want to calculate the Jaccard index with :param input_association_relationship: how the input node is connected to the association nodes :param target_association_relationship: how the target node is connected to the association node :param target_node_label: what kind of target nodes to return :param threshold: threshold to compute the Jaccard index :return: a list of tuples, an error_code, and an error_message. tuple[0] is a target node with tuple[1] jaccard index based on association nodes """ # get the description input_node_description = RU.get_node_property(input_node_ID, 'name') # get the nodes associated to the input node input_node_associated_nodes = RU.get_one_hop_target(input_node_label, input_node_ID, association_node_label, input_association_relationship) # Look more steps beyond if we didn't get any physically_interacts_with if input_node_associated_nodes == []: for max_path_len in range(2, 5): input_node_associated_nodes = RU.get_node_names_of_type_connected_to_target(input_node_label, input_node_ID, association_node_label, max_path_len=max_path_len, direction="u") if input_node_associated_nodes: break # Make sure you actually picked up at least one associated node if not input_node_associated_nodes: error_code = "NoNodesFound" error_message = "No %s found for %s." % (association_node_label, input_node_description) return [], error_code, error_message input_node_associated_nodes_set = set(input_node_associated_nodes) # get all the other disease that connect and get the association nodes in common # direct connection node_label_list = [association_node_label] relationship_label_list = [input_association_relationship, target_association_relationship] node_of_interest_position = 0 other_node_IDs_to_intersection_counts = dict() #if target_node_label == "disease" or target_node_label == "disease": # target_labels = ["disease", "disease"] #else: target_labels = [target_node_label] for target_label in target_labels: names2counts, names2nodes = RU.count_nodes_of_type_on_path_of_type_to_label(input_node_ID, input_node_label, target_label, node_label_list, relationship_label_list, node_of_interest_position) for ID in names2counts.keys(): if names2counts[ID] / float(len( input_node_associated_nodes_set)) >= threshold: # if it's below this threshold, no way the Jaccard index will be large enough other_node_IDs_to_intersection_counts[ID] = names2counts[ID] # check if any other associated nodes passed the threshold if not other_node_IDs_to_intersection_counts: error_code = "NoNodesFound" error_message = "No %s were found with similarity crossing the threshold of %f." % (target_node_label, threshold) parent = RU.get_one_hop_target(input_node_label, input_node_ID, input_node_label, "subclass_of", direction="r") if parent: parent = parent.pop() error_message += "\n Note that %s is a parent of %s, so you might try that instead." 
            # fill in the parent hint begun on the previous line
            error_message = error_message % (RU.get_node_property(parent, 'name'), input_node_description)
        return [], error_code, error_message

    # Now for each of the nodes connecting to source, count number of association nodes
    node_label_list = [association_node_label]
    relationship_label_list = [input_association_relationship, target_association_relationship]
    node_of_interest_position = 0
    other_node_counts = dict()
    for target_label in target_labels:
        temp_other_counts = RU.count_nodes_of_type_for_nodes_that_connect_to_label(
            input_node_ID, input_node_label, target_label, node_label_list, relationship_label_list, node_of_interest_position)
        # add it to the dictionary
        for key in temp_other_counts.keys():
            other_node_counts[key] = temp_other_counts[key]

    # then compute the jaccard index
    node_jaccard_tuples = []
    for other_node_ID in other_node_counts.keys():
        jaccard = 0
        if other_node_ID in other_node_IDs_to_intersection_counts:
            union_card = len(input_node_associated_nodes) + other_node_counts[other_node_ID] - \
                other_node_IDs_to_intersection_counts[other_node_ID]
            jaccard = other_node_IDs_to_intersection_counts[other_node_ID] / float(union_card)
        if jaccard > threshold:
            node_jaccard_tuples.append((other_node_ID, jaccard))

    # Format the results.
    # Maybe nothing passed the threshold
    if not node_jaccard_tuples:
        error_code = "NoNodesFound"
        error_message = "No %s's were found with similarity crossing the threshold of %f." % (target_node_label, threshold)
        parent = RU.get_one_hop_target(input_node_label, input_node_ID, input_node_label, "subclass_of", direction="r")
        if parent:
            parent = parent.pop()
            error_message += "\n Note that %s is a parent of %s, so you might try that instead." % (
                RU.get_node_property(parent, 'description'), input_node_description)
        return [], error_code, error_message

    # Otherwise there are results to return, first sort them largest to smallest
    node_jaccard_tuples_sorted = [(x, y) for x, y in sorted(node_jaccard_tuples, key=lambda pair: pair[1], reverse=True)]
    return node_jaccard_tuples_sorted, None, None
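# A minimal, dependency-free sketch of the Jaccard computation performed above, NOT the repository
# implementation: given the association-node set of the input node and of each candidate target,
# the similarity is |A intersect B| / |A union B|, and only candidates above the threshold are kept.
def toy_jaccard_similarities(input_associated, target_to_associated, threshold=0.2):
    """input_associated: set of association-node ids for the input node;
    target_to_associated: dict mapping target id -> set of its association-node ids."""
    results = []
    for target_id, associated in target_to_associated.items():
        union_card = len(input_associated | associated)
        jaccard = len(input_associated & associated) / float(union_card) if union_card else 0.0
        if jaccard > threshold:
            results.append((target_id, jaccard))
    return sorted(results, key=lambda pair: pair[1], reverse=True)


# Example with made-up phenotype sets:
# toy_jaccard_similarities({"HP:1", "HP:2", "HP:3"},
#                          {"DOID:2": {"HP:1", "HP:2"}, "DOID:3": {"HP:9"}})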
def answer(self, disease_ID, use_json=False, threshold=0.2): """ Answer the question: what other diseases have similarity >= jaccard=0.2 with the given disease_ID (in terms of phenotype overlap) :param disease_ID: KG disease name (eg. DOID:8398) :param use_json: use the standardized output format :param threshold: only include diseases with Jaccard index above this :return: None (print to stdout), unless there's an error, then return 1 """ # Initialize the response class response = FormatOutput.FormatResponse(4) # Check if node exists if not RU.node_exists_with_property(disease_ID, 'name'): error_message = "Sorry, the disease %s is not yet in our knowledge graph." % disease_ID error_code = "DiseaseNotFound" if not use_json: print(error_message) return 1 else: response.add_error_message(error_code, error_message) response.print() return 1 # Get label/kind of node the source is disease_label = RU.get_node_property(disease_ID, "label") if disease_label != "disease" and disease_label != "disease": error_message = "Sorry, the input has label %s and needs to be one of: disease, disease." \ " Please try a different term" % disease_label error_code = "NotADisease" if not use_json: print(error_message) return 1 else: response.add_error_message(error_code, error_message) response.print() return 1 # get the description disease_description = RU.get_node_property(disease_ID, 'description') # get the phenotypes associated to the disease disease_phenotypes = RU.get_one_hop_target(disease_label, disease_ID, "phenotypic_feature", "has_phenotype") # Look more steps beyond if we didn't get any physically_interacts_with if disease_phenotypes == []: for max_path_len in range(2, 5): disease_phenotypes = RU.get_node_names_of_type_connected_to_target( disease_label, disease_ID, "phenotypic_feature", max_path_len=max_path_len, direction="u") if disease_phenotypes: break # Make sure you actually picked up at least one phenotype if not disease_phenotypes: error_message = "No phenotypes found for this disease." error_code = "NoPhenotypesFound" if not use_json: print(error_message) return 1 else: response.add_error_message(error_code, error_message) response.print() return 1 disease_phenotypes_set = set(disease_phenotypes) # get all the other disease that connect and get the phenotypes in common # direct connection node_label_list = ["phenotypic_feature"] relationship_label_list = ["has_phenotype", "has_phenotype"] node_of_interest_position = 0 other_disease_IDs_to_intersection_counts = dict() for target_label in ["disease", "disease"]: names2counts, names2nodes = RU.count_nodes_of_type_on_path_of_type_to_label( disease_ID, disease_label, target_label, node_label_list, relationship_label_list, node_of_interest_position) for ID in names2counts.keys(): if names2counts[ID] / float( len(disease_phenotypes_set) ) >= threshold: # if it's below this threshold, no way the Jaccard index will be large enough other_disease_IDs_to_intersection_counts[ ID] = names2counts[ID] # check if any other diseases passed the threshold if not other_disease_IDs_to_intersection_counts: error_code = "NoDiseasesFound" error_message = "No diseases were found with similarity crossing the threshold of %f." % threshold parent = RU.get_one_hop_target(disease_label, disease_ID, disease_label, "subclass_of", direction="r") if parent: parent = parent.pop() error_message += "\n Note that %s is a parent disease to %s, so you might try that instead." 
                # fill in the parent-disease hint begun on the previous line
                error_message = error_message % (RU.get_node_property(parent, 'description'), disease_description)
            if not use_json:
                print(error_message)
                return 1
            else:
                response.add_error_message(error_code, error_message)
                response.print()
                return 1

        # Now for each of the diseases connecting to source, count number of phenotypes
        node_label_list = ["phenotypic_feature"]
        relationship_label_list = ["has_phenotype", "has_phenotype"]
        node_of_interest_position = 0
        # the DOID and OMIM counts were computed with identical calls (both use the "disease" label),
        # so a single count suffices
        other_disease_counts = RU.count_nodes_of_type_for_nodes_that_connect_to_label(
            disease_ID, disease_label, "disease", node_label_list, relationship_label_list, node_of_interest_position)

        # then compute the jaccard index
        disease_jaccard_tuples = []
        for other_disease_ID in other_disease_counts.keys():
            jaccard = 0
            if other_disease_ID in other_disease_IDs_to_intersection_counts:
                union_card = len(disease_phenotypes) + other_disease_counts[other_disease_ID] - \
                    other_disease_IDs_to_intersection_counts[other_disease_ID]
                jaccard = other_disease_IDs_to_intersection_counts[other_disease_ID] / float(union_card)
            if jaccard > threshold:
                disease_jaccard_tuples.append((other_disease_ID, jaccard))

        # Format the results.
        # Maybe nothing passed the threshold
        if not disease_jaccard_tuples:
            error_code = "NoDiseasesFound"
            error_message = "No diseases were found with similarity crossing the threshold of %f." % threshold
            parent = RU.get_one_hop_target(disease_label, disease_ID, disease_label, "subclass_of", direction="r")
            if parent:
                parent = parent.pop()
                error_message += "\n Note that %s is a parent disease to %s, so you might try that instead." % (
                    RU.get_node_property(parent, 'description'), disease_description)
            if not use_json:
                print(error_message)
                return 1
            else:
                response.add_error_message(error_code, error_message)
                return 1

        # Otherwise there are results to return, first sort them largest to smallest
        disease_jaccard_tuples_sorted = [(x, y) for x, y in sorted(disease_jaccard_tuples, key=lambda pair: pair[1], reverse=True)]
        if not use_json:
            to_print = "The diseases with phenotypes similar to %s are: \n" % disease_description
            for other_disease_ID, jaccard in disease_jaccard_tuples_sorted:
                to_print += "%s\t%s\tJaccard %f\n" % (
                    other_disease_ID, RU.get_node_property(other_disease_ID, 'description'), jaccard)
            print(to_print)
        else:
            for other_disease_ID, jaccard in disease_jaccard_tuples_sorted:
                to_print = "%s is phenotypically similar to the disease %s with similarity value %f" % (
                    disease_description, RU.get_node_property(other_disease_ID, 'description'), jaccard)
                g = RU.get_node_as_graph(other_disease_ID)
                response.add_subgraph(g.nodes(data=True), g.edges(data=True), to_print, jaccard)
            response.print()
def answer(self, entity, use_json=False): """ Answer a question of the type "What is X" but is general: :param entity: KG neo4j node name (eg "carbetocin") :param use_json: If the answer should be in Translator standardized API output format :return: a description and type of the node """ #### See if this entity is in the KG via the index eprint("Looking up '%s' in KgNodeIndex" % entity) kgNodeIndex = KGNodeIndex() curies = kgNodeIndex.get_curies(entity) #### If not in the KG, then return no information if not curies: if not use_json: return None else: error_code = "TermNotFound" error_message = "This concept is not in our knowledge graph" response = FormatOutput.FormatResponse(0) response.add_error_message(error_code, error_message) return response.message # Get label/kind of node the source is eprint("Getting properties for '%s'" % curies[0]) properties = RU.get_node_properties(curies[0]) eprint("Properties are:") eprint(properties) #### By default, return the results just as a plain simple list of data structures if not use_json: return properties #### Or, if requested, format the output as the standardized API output format else: #### Create a stub Message object response = FormatOutput.FormatResponse(0) response.message.table_column_names = [ "id", "type", "name", "description", "uri" ] response.message.code_description = None #### Create a Node object and fill it node1 = Node() node1.id = properties["id"] node1.uri = properties["uri"] node1.type = [properties["category"]] node1.name = properties["name"] node1.description = properties["description"] #### Create the first result (potential answer) result1 = Result() result1.id = "http://arax.ncats.io/api/v1/result/0000" result1.description = "The term %s is in our knowledge graph and is defined as %s" % ( properties["name"], properties["description"]) result1.confidence = 1.0 result1.essence = properties["name"] result1.essence_type = properties["category"] node_types = ",".join(node1.type) result1.row_data = [ node1.id, node_types, node1.name, node1.description, node1.uri ] #### Create a KnowledgeGraph object and put the list of nodes and edges into it result_graph = KnowledgeGraph() result_graph.nodes = [node1] result_graph.edges = [] #### Put the ResultGraph into the first result (potential answer) result1.result_graph = result_graph #### Put the first result (potential answer) into the message results = [result1] response.message.results = results #### Also put the union of all result_graph components into the top Message KnowledgeGraph #### Normally the knowledge_graph will be much more complex than this, but take a shortcut for this single-node result response.message.knowledge_graph = result_graph #### Also manufacture a query_graph post hoc qnode1 = QNode() qnode1.id = "n00" qnode1.curie = properties["id"] qnode1.type = None query_graph = QueryGraph() query_graph.nodes = [qnode1] query_graph.edges = [] response.message.query_graph = query_graph #### Create the corresponding knowledge_map node_binding = NodeBinding(qg_id="n00", kg_id=properties["id"]) result1.node_bindings = [node_binding] result1.edge_bindings = [] #eprint(response.message) return response.message
def answer(disease_id, use_json=False, num_show=20, rev=True, normalize=False): """ """ # Initialize the response class response = FormatOutput.FormatResponse(6) # get the description disease_description = RU.get_node_property(disease_id, 'name') # get subgraph of all all the symptom nodes connecting to the disease try: g = RU.return_subgraph_paths_of_type(disease_id, "disease", None, "phenotypic_feature", ["has_phenotype"], directed=False) except CustomExceptions.EmptyCypherError: error_code = "EmptyGraph" error_message = "Sorry, but there are no phenotypes associated to %s" % disease_description response.add_error_message(error_code, error_message) response.print() return 1 # decorate with cohd data RU.weight_graph_with_cohd_frequency( g, normalized=normalize ) # TODO: check if normalized on returns better results # sort the phenotypes by frequency names = nx.get_node_attributes(g, 'names') labels = nx.get_node_attributes(g, 'labels') descriptions = nx.get_node_attributes(g, 'description') # get the node corresponding to the disease disease_node = None for node in names.keys(): if names[node] == disease_id: disease_node = node # get all the nodes and the frequencies in one place node_freq_tuples = [] for node in names.keys(): if "phenotypic_feature" == list(set(labels[node]) - {"Base"}).pop(): # get the corresponding edge frequency (try both directions) edge_data = g.get_edge_data(disease_node, node) if "cohd_freq" in edge_data and isinstance( edge_data["cohd_freq"], float): freq = edge_data["cohd_freq"] else: edge_data = g.get_edge_data(node, disease_node) if "cohd_freq" in edge_data and isinstance( edge_data["cohd_freq"], float): freq = edge_data["cohd_freq"] else: freq = 0 node_freq_tuples.append((node, freq)) # sort the node freqs node_freq_tuples_sorted = sorted(node_freq_tuples, key=lambda x: x[1], reverse=rev) # reduce to top n node_freq_tuples_sorted_top_n = node_freq_tuples_sorted if len(node_freq_tuples_sorted_top_n) > num_show: node_freq_tuples_sorted_top_n = node_freq_tuples_sorted_top_n[ 0:num_show] # good nodes good_nodes = set([tup[0] for tup in node_freq_tuples_sorted_top_n]) good_nodes.add(disease_node) # all nodes all_nodes = set([tup[0] for tup in node_freq_tuples_sorted]) # remove the other nodes from the graph g.remove_nodes_from(all_nodes - good_nodes) # return the results if not use_json: if rev: to_print = "The most common phenotypes " else: to_print = "The least common phenotypes " to_print += "associated with %s, according to the Columbia Open Health Data, are:\n" % disease_description for node, freq in node_freq_tuples_sorted_top_n: to_print += "phenotype: %s\t frequency %f \n" % ( descriptions[node], freq) print(to_print) else: for node, freq in node_freq_tuples_sorted_top_n: to_print = "According to the Columbia Open Health Data, %s has the phenotype %s with frequency %f." % ( disease_description, descriptions[node], freq) sub_g = nx.subgraph(g, [disease_node, node]) # add it to the response response.add_subgraph(sub_g.nodes(data=True), sub_g.edges(data=True), to_print, freq) response.print()
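# A short self-contained networkx sketch (illustration only, not the repository code) of the
# frequency-based ranking above: phenotype neighbors of a disease node are sorted by a
# 'cohd_freq' attribute stored on the connecting edge, with missing frequencies defaulting to 0.
import networkx as nx


def toy_rank_phenotypes_by_frequency(g, disease_node, num_show=20, rev=True):
    node_freq_tuples = []
    for neighbor in g.neighbors(disease_node):
        edge_data = g.get_edge_data(disease_node, neighbor) or {}
        freq = edge_data.get("cohd_freq", 0)
        node_freq_tuples.append((neighbor, freq))
    return sorted(node_freq_tuples, key=lambda x: x[1], reverse=rev)[:num_show]


# toy_g = nx.Graph()
# toy_g.add_edge("DOID:8398", "HP:0002315", cohd_freq=0.4)  # made-up frequency
# toy_g.add_edge("DOID:8398", "HP:0001945", cohd_freq=0.1)
# toy_rank_phenotypes_by_frequency(toy_g, "DOID:8398")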
def answer(self, query_graph, TxltrApiFormat=False): """ Answer a question based on the input query_graph: :param query_graph: QueryGraph object :param TxltrApiFormat: Set to true if the answer should be in Translator standardized API output format :return: Result of the query in native or API format """ #### Create a stub Message object response = FormatOutput.FormatResponse(0) #### Include the original query_graph in the envelope response.message.query_graph = query_graph #### Perform some basic validation of the query graph before sending to the server result = self.validate_query_graph(query_graph) if result["message_code"] != "OK": response.add_error_message(result["message_code"], result["code_description"]) return (response.message) #### Insert some dummy question stuff response.message.original_question = "Input via Query Graph" response.message.restated_question = "No restatement for QueryGraph yet" #### Preprocess query_graph object query_graph, sort_flags, res_limit, ascending_flag = self.preprocess_query_graph( query_graph) #### Interpret the query_graph object to create a cypher query and encode the result in a response try: query_gen = RU.get_cypher_from_question_graph( {'question_graph': query_graph}) answer_graph_cypher = query_gen.cypher_query_answer_map() knowledge_graph_cypher = query_gen.cypher_query_knowledge_graph() except Exception as error: response.add_error_message("CypherGenerationError", format(error)) return (response.message) #### The Robokop code renames stuff in the query_graph for strange reasons. Rename them back. #### It would be better to not make the changes in the first place. FIXME #for node in response.message.query_graph["nodes"]: # node["node_id"] = node["id"] # node.pop("id", None) #for edge in response.message.query_graph["edges"]: # edge["edge_id"] = edge["id"] # edge.pop("id", None) #### Execute the cypher to obtain results[]. Return an error if there are no results, or otherwise extract the list try: with RU.driver.session() as session: result = session.run(answer_graph_cypher) answer_graph_list = result.data() except Exception as error: response.add_error_message("QueryGraphError", format(error)) return (response.message) if len(answer_graph_list) == 0: response.add_error_message( "NoPathsFound", "No paths satisfying this query graph were found") return (response.message) #### Execute the knowledge_graph cypher. Return an error if there are no results, or otherwise extract the dict try: with RU.driver.session() as session: result = session.run(knowledge_graph_cypher) result_data = result.data() except Exception as error: response.add_error_message("QueryGraphError", format(error)) return (response.message) if len(result_data) == 0: response.add_error_message( "NoPathsFound", "No paths satisfying this query graph were found") return (response.message) knowledge_graph_dict = result_data[0] #### If TxltrApiFormat was not specified, just return a single data structure with the results if not TxltrApiFormat: return { 'answer_subgraphs': answer_graph_list, 'knowledge_graph': knowledge_graph_dict } #### Add the knowledge_graph and bindings to the Message response.add_split_results(knowledge_graph_dict, answer_graph_list) #response.message.table_column_names = [ "id", "type", "name", "description", "uri" ] #response.message.code_description = None #### Enrich the Message Results with some inferred information response.infer_result_information() #### Return the final result message return (response.message)
def main(): parser = argparse.ArgumentParser( description= "Answers questions of the form: 'what pathways are most enriched by $protein_list?'", formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('-s', '--source', type=str, help="source curie ID", default="UniProtKB:Q96M43") parser.add_argument('-t', '--target', type=str, help="target node type", default="pathway") parser.add_argument('-y', '--type', type=str, help="source node type", default="protein") parser.add_argument( '-j', '--json', action='store_true', help= 'Flag specifying that results should be printed in JSON format (to stdout)', default=False) parser.add_argument( '-r', '--rel_type', type=str, help='Only do the Fisher exact test along edges of this type', default=None) parser.add_argument( '--describe', action='store_true', help='Print a description of the question to stdout and quit', default=False) parser.add_argument('--num_show', type=int, help='Maximum number of results to return', default=20) # Parse and check args args = parser.parse_args() source_arg = args.source target_type = args.target source_type = args.type use_json = args.json describe_flag = args.describe num_show = args.num_show rel_type = args.rel_type if source_arg[0] == "[": if "','" not in source_arg: source_arg = source_arg.replace(",", "','").replace("[", "['").replace( "]", "']") source_list = ast.literal_eval(source_arg) source_list_strip = [] for source in source_list: source_list_strip.append(source.strip()) source_list = source_list_strip else: source_list = [source_arg] # Initialize the question class Q = QuestionFisher() if describe_flag: res = Q.describe() print(res) else: # Initialize the response class response = FormatOutput.FormatResponse(6) response.response.table_column_names = [ "target name", "target ID", "P value" ] graph_weight_tuples = [] q_answer = Q.answer(source_list, source_type, target_type, use_json=use_json, num_show=num_show, rel_type=rel_type) if not q_answer: # if q_answer == None return None # All messages printed out; safe to quit p_dict, target_list = q_answer # print out the results if not use_json: for target_name in target_list: target_description = RU.get_node_property( target_name, "name", node_label=target_type) print("%s %f" % (target_description, p_dict[target_name])) else: #response.response.table_column_names = ["source name", "source ID", "target name", "target ID", "path weight", # "target source google distance", # "ML probability target treats source"] for target_name in target_list: target_description = RU.get_node_property( target_name, "name", node_label=target_type) target_id_old_curie = target_name.replace( "CHEMBL.COMPOUND:CHEMBL", "ChEMBL:") confidence = p_dict[target_name] # populate the graph graph = RU.get_graph_from_nodes([target_name]) res = response.add_subgraph( graph.nodes(data=True), graph.edges(data=True), "The target %s is enriched by %s." % (target_description, str(source_list)), confidence, return_result=True) res.essence = "%s" % target_description # populate with essence of question result row_data = [] # initialize the row data #row_data.append("%s" % source_description) #row_data.append("%s" % source_id) row_data.append("%s" % target_description) row_data.append("%s" % target_name) row_data.append("%f" % confidence) #row_data.append("%f" % gd) #row_data.append("%f" % prob) res.row_data = row_data response.print()
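# A self-contained sketch (illustration only) of how the bracketed --source argument above is
# turned into a Python list: unquoted curies such as "[UniProtKB:Q96M43,UniProtKB:P12345]" are
# quoted first so ast.literal_eval can parse them, then each entry is stripped of whitespace.
import ast


def toy_parse_source_arg(source_arg):
    if source_arg.startswith("["):
        if "','" not in source_arg:
            source_arg = source_arg.replace(",", "','").replace("[", "['").replace("]", "']")
        return [s.strip() for s in ast.literal_eval(source_arg)]
    return [source_arg]


# toy_parse_source_arg("[UniProtKB:Q96M43, UniProtKB:P12345]")
# -> ['UniProtKB:Q96M43', 'UniProtKB:P12345']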
def get_parameters(self, input_question): """ Given the input_question, try to extract the proper parameters :param input_question: plain text input question :return: a dictionary (with keys self.parameter_names), values either None or the KG node names """ parameters = dict() for parameter in self.parameter_names: parameters[parameter] = None # The "what is a X?" questions are of a completely different form and are handled separately if self.parameter_names == ["term"]: # Next, see if it's a "what is" question term = None input_question = re.sub("\?", "", input_question) input_question = re.sub("^\s+", "", input_question) input_question = re.sub("\s+$", "", input_question) input_question = input_question.lower() match = re.match("what is\s*(a|an)?\s+(.+)", input_question, re.I) if match: term = match.group(2) term = re.sub("^\s+", "", term) term = re.sub("\s+$", "", term) parameters["term"] = term return parameters match = re.match("what are (.+)", input_question, re.I) if match: term = match.group(1) term = re.sub("^\s+", "", term) term = re.sub("\s+$", "", term) parameters["term"] = term return parameters else: return parameters else: # Otherwise, it's a standard question template # get all n-tuples of words in the question (largest to smallest) blocks = [] question_tokenized = nltk.word_tokenize(input_question, "english") # Tokenizers have a bad habit of splitting on \', so fix it question_tokenized_no_apos_split = [] for ind, block in enumerate(question_tokenized): if block[0] == "'" and ind > 0: # the tokenizer split on apostrophe question_tokenized_no_apos_split[ind - 1] += question_tokenized[ind] else: question_tokenized_no_apos_split.append(block) question_tokenized = question_tokenized_no_apos_split for block_size in range(1, len(question_tokenized)): for i in range(len(question_tokenized) - block_size + 1): block = " ".join(question_tokenized[i:(i + block_size)]) blocks.append(block) blocks = list(reversed(blocks)) # Look for anything that could be a node name candidate_node_names = [] found_blocks = [] # keep track of the already found blocks TODO: this will cause problems when you ask something like "how are malaria and mixed malaria different?" 
for block in blocks: nodes = find_node_name(block) if nodes: if all([block not in b for b in found_blocks]): # only add it if it's not a proper subset of an already found block candidate_node_names.extend(nodes) found_blocks.append(block) #print(block) # Get the node labels for the found nodes candidate_node_names_labels = set() # set automatically deduplicates for me for node in candidate_node_names: node_label = RU.get_node_property(node, "label") # TODO: Arnab's UMLS lookup candidate_node_names_labels.add((node, node_label)) # turn it back into a set for indexing candidate_node_names_labels = list(candidate_node_names_labels) # For each of the parameter names, make sure it only shows up once, and if so, populate it for parameter_name in self.parameter_names: parameter_name_positions = [] pos = 0 for node, node_label in candidate_node_names_labels: if node_label == parameter_name: parameter_name_positions.append(pos) pos += 1 if len(parameter_name_positions) > 1: raise CustomExceptions.MultipleTerms(parameter_name, [candidate_node_names_labels[pos][0] for pos in parameter_name_positions]) elif len(parameter_name_positions) == 0: pass else: # There's exactly one term pos = parameter_name_positions.pop() parameters[parameter_name] = candidate_node_names_labels[pos][0] # Throw in the extra parameters for key, value in self.other_parameters.items(): parameters[key] = value return parameters
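# A minimal, self-contained sketch (illustration only) of the candidate-block generation used in
# get_parameters above: every contiguous run of tokens in the question is produced, and the blocks
# are considered from largest to smallest so that longer node names win over their substrings.
def toy_word_blocks(question):
    tokens = question.split()  # the repository code uses nltk.word_tokenize instead
    blocks = []
    for block_size in range(1, len(tokens)):
        for i in range(len(tokens) - block_size + 1):
            blocks.append(" ".join(tokens[i:(i + block_size)]))
    return list(reversed(blocks))


# toy_word_blocks("what targets acetaminophen")
# -> ['targets acetaminophen', 'what targets', 'acetaminophen', 'targets', 'what']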
def answer(self, source_name, target_label, relationship_type, use_json=False, directed=False): """ Answer a question of the type "What proteins does drug X target" but is general: what <node X type> does <node Y grounded> <relatioship Z> that can be answered in one hop in the KG (increasing the step size if necessary). :param query_terms: a triple consisting of a source node name (KG neo4j node name, the target label (KG neo4j "node label") and the relationship type (KG neo4j "Relationship type") :param source_name: KG neo4j node name (eg "carbetocin") :param target_label: KG node label (eg. "protein") :param relationship_type: KG relationship type (eg. "physically_interacts_with") :param use_json: If the answer should be in Eric's Json standardized API output format :return: list of dictionaries containing the nodes that are one hop (along relationship type) that connect source to target. """ # Get label/kind of node the source is source_label = RU.get_node_property(source_name, "label") # Get the subgraph (all targets along relationship) has_intermediate_node = False try: g = RU.return_subgraph_paths_of_type(source_name, source_label, None, target_label, [relationship_type], directed=directed) except CustomExceptions.EmptyCypherError: try: has_intermediate_node = True g = RU.return_subgraph_paths_of_type( source_name, source_label, None, target_label, ['subclass_of', relationship_type], directed=directed) except CustomExceptions.EmptyCypherError: error_message = "No path between %s and %s via relationship %s" % ( source_name, target_label, relationship_type) error_code = "NoPathsFound" response = FormatOutput.FormatResponse(3) response.add_error_message(error_code, error_message) return response # extract the source_node_number for node, data in g.nodes(data=True): if data['properties']['id'] == source_name: source_node_number = node break # Get all the target numbers target_numbers = [] for node, data in g.nodes(data=True): if data['properties']['id'] != source_name: target_numbers.append(node) # if there's an intermediate node, get the name if has_intermediate_node: neighbors = list(g.neighbors(source_node_number)) if len(neighbors) > 1: error_message = "More than one intermediate node" error_code = "AmbiguousPath" response = FormatOutput.FormatResponse(3) response.add_error_message(error_code, error_message) return response else: intermediate_node = neighbors.pop() #### If use_json not specified, then return results as a fairly plain list if not use_json: results_list = list() for target_number in target_numbers: data = g.nodes[target_number] results_list.append({ 'type': list(set(data['labels']) - {'Base'}).pop(), 'name': data['properties']['name'], 'desc': data['properties']['name'], 'prob': 1 }) # All these are known to be true return results_list #### Else if use_json requested, return the results in the Translator standard API JSON format else: response = FormatOutput.FormatResponse(3) # it's a Q3 question response.message.table_column_names = [ "source name", "source ID", "target name", "target ID" ] source_description = g.nodes[source_node_number]['properties'][ 'name'] #### Create the QueryGraph for this type of question query_graph = QueryGraph() source_node = QNode() source_node.id = "n00" source_node.curie = g.nodes[source_node_number]['properties']['id'] source_node.type = g.nodes[source_node_number]['properties'][ 'category'] target_node = QNode() target_node.id = "n01" target_node.type = target_label query_graph.nodes = [source_node, target_node] edge1 = QEdge() 
edge1.id = "e00" edge1.source_id = "n00" edge1.target_id = "n01" edge1.type = relationship_type query_graph.edges = [edge1] response.message.query_graph = query_graph #### Create a mapping dict with the source curie and the target type. This dict is used for reverse lookups by type #### for mapping to the QueryGraph. response._type_map = dict() response._type_map[source_node.curie] = source_node.id response._type_map[target_node.type] = target_node.id response._type_map[edge1.type] = edge1.id #### Loop over all the returned targets and put them into the response structure for target_number in target_numbers: target_description = g.nodes[target_number]['properties'][ 'name'] if not has_intermediate_node: subgraph = g.subgraph([source_node_number, target_number]) else: subgraph = g.subgraph( [source_node_number, intermediate_node, target_number]) res = response.add_subgraph( subgraph.nodes(data=True), subgraph.edges(data=True), "%s and %s are connected by the relationship %s" % (source_description, target_description, relationship_type), 1, return_result=True) res.essence = "%s" % target_description # populate with essence of question result res.essence_type = g.nodes[target_number]['properties'][ 'category'] # populate with the type of the essence of question result row_data = [] # initialize the row data row_data.append("%s" % source_description) row_data.append( "%s" % g.nodes[source_node_number]['properties']['id']) row_data.append("%s" % target_description) row_data.append("%s" % g.nodes[target_number]['properties']['id']) res.row_data = row_data return response
def answer(self, source_name, target_label, relationship_type, use_json=False): """ Answer a question of the type "What proteins does drug X target" but is general: what <node X type> does <node Y grounded> <relatioship Z> that can be answered in one hop in the KG (increasing the step size if necessary). :param query_terms: a triple consisting of a source node name (KG neo4j node name, the target label (KG neo4j "node label") and the relationship type (KG neo4j "Relationship type") :param source_name: KG neo4j node name (eg "carbetocin") :param target_label: KG node label (eg. "protein") :param relationship_type: KG relationship type (eg. "directly_interacts_with") :param use_json: If the answer should be in Eric's Json standardized API output format :return: list of dictionaries containing the nodes that are one hop (along relationship type) that connect source to target. """ # Get label/kind of node the source is source_label = RU.get_node_property(source_name, "label") # Get the subgraph (all targets along relationship) has_intermediate_node = False try: g = RU.return_subgraph_paths_of_type(source_name, source_label, None, target_label, [relationship_type], directed=False) except CustomExceptions.EmptyCypherError: try: has_intermediate_node = True g = RU.return_subgraph_paths_of_type( source_name, source_label, None, target_label, ['subclass_of', relationship_type], directed=False) except CustomExceptions.EmptyCypherError: error_message = "No path between %s and %s via relationship %s" % ( source_name, target_label, relationship_type) error_code = "NoPathsFound" response = FormatOutput.FormatResponse(3) response.add_error_message(error_code, error_message) return response # extract the source_node_number for node, data in g.nodes(data=True): if data['properties']['name'] == source_name: source_node_number = node break # Get all the target numbers target_numbers = [] for node, data in g.nodes(data=True): if data['properties']['name'] != source_name: target_numbers.append(node) # if there's an intermediate node, get the name if has_intermediate_node: neighbors = list(g.neighbors(source_node_number)) if len(neighbors) > 1: error_message = "More than one intermediate node" error_code = "AmbiguousPath" response = FormatOutput.FormatResponse(3) response.add_error_message(error_code, error_message) return response else: intermediate_node = neighbors.pop() # Format the results. if not use_json: results_list = list() for target_number in target_numbers: data = g.node[target_number] results_list.append({ 'type': list(set(data['labels']) - {'Base'}).pop(), 'name': data['properties']['name'], 'desc': data['properties']['description'], 'prob': 1 }) # All these are known to be true return results_list else: # You want the standardized API output format response = FormatOutput.FormatResponse(3) # it's a Q3 question source_description = g.node[source_node_number]['properties'][ 'description'] for target_number in target_numbers: target_description = g.node[target_number]['properties'][ 'description'] if not has_intermediate_node: subgraph = g.subgraph([source_node_number, target_number]) else: subgraph = g.subgraph( [source_node_number, intermediate_node, target_number]) response.add_subgraph( subgraph.nodes(data=True), subgraph.edges(data=True), "%s and %s are connected by the relationship %s" % (source_description, target_description, relationship_type), 1) return response
def answer(disease_id, use_json=False, num_show=25): num_input_disease_symptoms = 25 # number of representative symptoms of the disease to keep num_omim_keep = 25 # number of genetic conditions to keep num_protein_keep = 25 # number of implicated proteins to keep num_pathways_keep = 25 # number of pathways to keep num_pathway_proteins_selected = 25 # number of proteins enriched for the above pathways to select num_drugs_keep = 2 * num_show # number of drugs that target those proteins to keep num_paths = 2 # number of paths to keep for each drug selected # Initialize the response class response = FormatOutput.FormatResponse(6) response.response.table_column_names = [ "disease name", "disease ID", "drug name", "drug ID", "confidence" ] # get the description of the disease disease_description = RU.get_node_property(disease_id, 'name') # Find symptoms of disease # symptoms = RU.get_one_hop_target("disease", disease_id, "phenotypic_feature", "has_phenotype") # symptoms_set = set(symptoms) (symptoms_dict, symptoms) = RU.top_n_fisher_exact([disease_id], "disease", "phenotypic_feature", rel_type="has_phenotype", n=num_input_disease_symptoms) symptoms_set = set(symptoms) # check for an error if not symptoms_set: error_message = "I found no phenotypic_features for %s." % disease_description if not use_json: print(error_message) return else: error_code = "NoPathsFound" response = FormatOutput.FormatResponse(3) response.add_error_message(error_code, error_message) response.print() return # Find diseases enriched for that phenotype path_type = [ "gene_mutations_contribute_to", "protein", "participates_in", "pathway", "participates_in", "protein", "physically_interacts_with", "chemical_substance" ] (genetic_diseases_dict, genetic_diseases_selected) = RU.top_n_fisher_exact( symptoms, "phenotypic_feature", "disease", rel_type="has_phenotype", n=num_omim_keep, curie_prefix="OMIM", on_path=path_type, exclude=disease_id) if not genetic_diseases_selected: error_message = "I found no diseases connected to phenotypes of %s." % disease_description if not use_json: print(error_message) return else: error_code = "NoPathsFound" response = FormatOutput.FormatResponse(3) response.add_error_message(error_code, error_message) response.print() return # find the most representative proteins in these diseases path_type = [ "participates_in", "pathway", "participates_in", "protein", "physically_interacts_with", "chemical_substance" ] (implicated_proteins_dict, implicated_proteins_selected) = RU.top_n_fisher_exact( genetic_diseases_selected, "disease", "protein", rel_type="gene_mutations_contribute_to", n=num_protein_keep, on_path=path_type) if not implicated_proteins_selected: error_message = "I found no proteins connected to diseases connected to phenotypes of %s." % disease_description if not use_json: print(error_message) return else: error_code = "NoPathsFound" response = FormatOutput.FormatResponse(3) response.add_error_message(error_code, error_message) response.print() return # find enriched pathways from those proteins path_type = [ "participates_in", "protein", "physically_interacts_with", "chemical_substance" ] (pathways_selected_dict, pathways_selected) = RU.top_n_fisher_exact( implicated_proteins_selected, "protein", "pathway", rel_type="participates_in", n=num_pathways_keep, on_path=path_type) if not pathways_selected: error_message = "I found no pathways connected to proteins connected to diseases connected to phenotypes of %s." 
% disease_description if not use_json: print(error_message) return else: error_code = "NoPathsFound" response = FormatOutput.FormatResponse(3) response.add_error_message(error_code, error_message) response.print() return # find proteins enriched for those pathways path_type = ["physically_interacts_with", "chemical_substance"] (pathway_proteins_dict, pathway_proteins_selected) = RU.top_n_fisher_exact( pathways_selected, "pathway", "protein", rel_type="participates_in", n=num_pathway_proteins_selected, on_path=path_type) if not pathway_proteins_selected: error_message = "I found no proteins connected to pathways connected to proteins connected to diseases connected to phenotypes of %s." % disease_description if not use_json: print(error_message) return else: error_code = "NoPathsFound" response = FormatOutput.FormatResponse(3) response.add_error_message(error_code, error_message) response.print() return # find drugs enriched for targeting those proteins (drugs_selected_dict, drugs_selected) = RU.top_n_fisher_exact( pathway_proteins_selected, "protein", "chemical_substance", rel_type="physically_interacts_with", n=num_drugs_keep) if not drugs_selected: error_message = "I found no drugs connected toproteins connected to pathways connected to proteins connected to diseases connected to phenotypes of %s." % disease_description if not use_json: print(error_message) return else: error_code = "NoPathsFound" response = FormatOutput.FormatResponse(3) response.add_error_message(error_code, error_message) response.print() return path_type = [ "disease", "has_phenotype", "phenotypic_feature", "has_phenotype", "disease", "gene_mutations_contribute_to", "protein", "participates_in", "pathway", "participates_in", "protein", "physically_interacts_with", "chemical_substance" ] g = RU.get_subgraph_through_node_sets_known_relationships( path_type, [[disease_id], symptoms, genetic_diseases_selected, implicated_proteins_selected, pathways_selected, pathway_proteins_selected, drugs_selected], directed=True) graph_weight_tuples = [] for drug in drugs_selected: # get the relevant subgraph from this drug back to the input disease node_types = [ "disease", "phenotypic_feature", "disease", "protein", "pathway", "protein", "chemical_substance" ] drug_pathway_protein_neighbors = RU.one_hope_neighbors_of_type( g, drug, 'protein', 'R') drug_pathway_neighbors = set() for protein in drug_pathway_protein_neighbors: drug_pathway_neighbors.update( RU.one_hope_neighbors_of_type(g, protein, 'pathway', 'R')) drug_protein_neighbors = set() for pathway in drug_pathway_neighbors: drug_protein_neighbors.update( RU.one_hope_neighbors_of_type(g, pathway, 'protein', 'L')) drug_disease_neighbors = set() for protein in drug_protein_neighbors: drug_disease_neighbors.update( RU.one_hope_neighbors_of_type(g, protein, 'disease', 'R')) drug_phenotype = set() for disease in drug_disease_neighbors: drug_phenotype.update( RU.one_hope_neighbors_of_type(g, disease, 'phenotypic_feature', 'R')) g2 = RU.get_subgraph_through_node_sets_known_relationships( path_type, [[disease_id], drug_phenotype, drug_disease_neighbors, drug_protein_neighbors, drug_pathway_neighbors, drug_pathway_protein_neighbors, [drug]], directed=False) drug_id_old_curie = drug.replace("CHEMBL.COMPOUND:CHEMBL", "ChEMBL:") # Machine learning probability of "treats" prob = p.prob_single(drug_id_old_curie, disease_id) if not prob: prob = -1 else: prob = prob[0] graph_weight_tuples.append((g, prob, drug)) # sort by the path weight graph_weight_tuples.sort(key=lambda x: x[1], 
reverse=True) # print out the results if not use_json: num_shown = 0 for graph, weight, drug_id in graph_weight_tuples: num_shown += 1 if num_shown > num_show: break drug_description = RU.get_node_property( drug_id, "name", node_label="chemical_substance") drug_id_old_curie = drug_id.replace("CHEMBL.COMPOUND:CHEMBL", "ChEMBL:") # Machine learning probability of "treats" prob = p.prob_single(drug_id_old_curie, disease_id) if not prob: prob = -1 else: prob = prob[0] print("%s %f %f" % (drug_description, weight, prob)) else: # add the neighborhood graph response.add_neighborhood_graph(g.nodes(data=True), g.edges(data=True)) response.response.table_column_names = [ "disease name", "disease ID", "drug name", "drug ID", "path weight", "drug disease google distance", "ML probability drug treats disease" ] num_shown = 0 for graph, weight, drug_id in graph_weight_tuples: num_shown += 1 if num_shown > num_show: break drug_description = RU.get_node_property( drug_id, "name", node_label="chemical_substance") drug_id_old_curie = drug_id.replace("CHEMBL.COMPOUND:CHEMBL", "ChEMBL:") # Machine learning probability of "treats" prob = p.prob_single(drug_id_old_curie, disease_id) if not prob: prob = -1 else: prob = prob[0] confidence = prob # Google distance gd = NormGoogleDistance.get_ngd_for_all( [drug_id, disease_id], [drug_description, disease_description]) # populate the graph res = response.add_subgraph( graph.nodes(data=True), graph.edges(data=True), "The drug %s is predicted to treat %s." % (drug_description, disease_description), confidence, return_result=True) res.essence = "%s" % drug_description # populate with essence of question result row_data = [] # initialize the row data row_data.append("%s" % disease_description) row_data.append("%s" % disease_id) row_data.append("%s" % drug_description) row_data.append("%s" % drug_id) row_data.append("%f" % weight) row_data.append("%f" % gd) row_data.append("%f" % prob) res.row_data = row_data response.print()
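# --- Illustrative sketch (not RU.top_n_fisher_exact) ---
# Each enrichment step above ranks candidate nodes by a Fisher's exact test on how many of the
# query nodes they touch. A minimal stand-alone version of that ranking, given precomputed neighbor
# sets and a universe size (all names and numbers here are hypothetical):
from scipy.stats import fisher_exact

def rank_by_fisher_exact(query_set, candidate_to_neighbors, universe_size, n=20):
    """Return the n candidates most enriched for members of query_set (smallest p-value first)."""
    scored = []
    for candidate, neighbors in candidate_to_neighbors.items():
        overlap = len(query_set & neighbors)
        # 2x2 contingency table: rows = neighbor / non-neighbor, columns = in query set / not
        table = [[overlap, len(neighbors) - overlap],
                 [len(query_set) - overlap, universe_size - len(query_set) - len(neighbors) + overlap]]
        _, p_value = fisher_exact(table, alternative="greater")
        scored.append((candidate, p_value))
    scored.sort(key=lambda x: x[1])  # smallest p-value = most enriched
    return scored[:n]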
def answer(disease_id, use_json=False, num_show=25): num_input_disease_symptoms = 25 # number of representative symptoms of the disease to keep num_omim_keep = 25 # number of genetic conditions to keep num_protein_keep = 25 # number of implicated proteins to keep num_pathways_keep = 25 # number of pathways to keep num_pathway_proteins_selected = 25 # number of proteins enriched for the above pathways to select num_drugs_keep = num_show # number of drugs that target those proteins to keep num_paths = 2 # number of paths to keep for each drug selected # Initialize the response class response = FormatOutput.FormatResponse(6) response.response.table_column_names = [ "disease name", "disease ID", "drug name", "drug ID", "confidence" ] # get the description of the disease disease_description = RU.get_node_property(disease_id, 'name') # Find symptoms of disease # symptoms = RU.get_one_hop_target("disease", disease_id, "phenotypic_feature", "has_phenotype") # symptoms_set = set(symptoms) (symptoms_dict, symptoms) = RU.top_n_fisher_exact([disease_id], "disease", "phenotypic_feature", rel_type="has_phenotype", n=num_input_disease_symptoms) symptoms_set = set(symptoms) # check for an error if not symptoms_set: error_message = "I found no phenotypic_features for %s." % disease_description if not use_json: print(error_message) return else: error_code = "NoPathsFound" response = FormatOutput.FormatResponse(3) response.add_error_message(error_code, error_message) response.print() return # Find diseases enriched for that phenotype path_type = [ "gene_mutations_contribute_to", "protein", "participates_in", "pathway", "participates_in", "protein", "physically_interacts_with", "chemical_substance" ] (genetic_diseases_dict, genetic_diseases_selected) = RU.top_n_fisher_exact( symptoms, "phenotypic_feature", "disease", rel_type="has_phenotype", n=num_omim_keep, curie_prefix="OMIM", on_path=path_type, exclude=disease_id) if not genetic_diseases_selected: error_message = "I found no diseases connected to phenotypes of %s." % disease_description if not use_json: print(error_message) return else: error_code = "NoPathsFound" response = FormatOutput.FormatResponse(3) response.add_error_message(error_code, error_message) response.print() return # find the most representative proteins in these diseases path_type = [ "participates_in", "pathway", "participates_in", "protein", "physically_interacts_with", "chemical_substance" ] (implicated_proteins_dict, implicated_proteins_selected) = RU.top_n_fisher_exact( genetic_diseases_selected, "disease", "protein", rel_type="gene_mutations_contribute_to", n=num_protein_keep, on_path=path_type) if not implicated_proteins_selected: error_message = "I found no proteins connected to diseases connected to phenotypes of %s." % disease_description if not use_json: print(error_message) return else: error_code = "NoPathsFound" response = FormatOutput.FormatResponse(3) response.add_error_message(error_code, error_message) response.print() return # find enriched pathways from those proteins path_type = [ "participates_in", "protein", "physically_interacts_with", "chemical_substance" ] (pathways_selected_dict, pathways_selected) = RU.top_n_fisher_exact( implicated_proteins_selected, "protein", "pathway", rel_type="participates_in", n=num_pathways_keep, on_path=path_type) if not pathways_selected: error_message = "I found no pathways connected to proteins connected to diseases connected to phenotypes of %s." 
% disease_description if not use_json: print(error_message) return else: error_code = "NoPathsFound" response = FormatOutput.FormatResponse(3) response.add_error_message(error_code, error_message) response.print() return # find proteins enriched for those pathways path_type = ["physically_interacts_with", "chemical_substance"] (pathway_proteins_dict, pathway_proteins_selected) = RU.top_n_fisher_exact( pathways_selected, "pathway", "protein", rel_type="participates_in", n=num_pathway_proteins_selected, on_path=path_type) if not pathway_proteins_selected: error_message = "I found no proteins connected to pathways connected to proteins connected to diseases connected to phenotypes of %s." % disease_description if not use_json: print(error_message) return else: error_code = "NoPathsFound" response = FormatOutput.FormatResponse(3) response.add_error_message(error_code, error_message) response.print() return # find drugs enriched for targeting those proteins (drugs_selected_dict, drugs_selected) = RU.top_n_fisher_exact( pathway_proteins_selected, "protein", "chemical_substance", rel_type="physically_interacts_with", n=num_drugs_keep) if not drugs_selected: error_message = "I found no drugs connected toproteins connected to pathways connected to proteins connected to diseases connected to phenotypes of %s." % disease_description if not use_json: print(error_message) return else: error_code = "NoPathsFound" response = FormatOutput.FormatResponse(3) response.add_error_message(error_code, error_message) response.print() return # Next, find the most likely paths # extract the relevant subgraph path_type = [ "disease", "has_phenotype", "phenotypic_feature", "has_phenotype", "disease", "gene_mutations_contribute_to", "protein", "participates_in", "pathway", "participates_in", "protein", "physically_interacts_with", "chemical_substance" ] g = RU.get_subgraph_through_node_sets_known_relationships( path_type, [[disease_id], symptoms, genetic_diseases_selected, implicated_proteins_selected, pathways_selected, pathway_proteins_selected, drugs_selected]) # decorate graph with fisher p-values # get dict of id to nx nodes nx_node_to_id = nx.get_node_attributes(g, "names") nx_id_to_node = dict() # reverse the dictionary for node in nx_node_to_id.keys(): id = nx_node_to_id[node] nx_id_to_node[id] = node i = 0 for u, v, d in g.edges(data=True): u_id = nx_node_to_id[u] v_id = nx_node_to_id[v] # decorate correct nodes # input disease to symptoms, decorated by symptom p-value if (u_id in symptoms_set and v_id == disease_id) or (v_id in symptoms_set and u_id == disease_id): try: d["p_value"] = symptoms_dict[v_id] except: d["p_value"] = symptoms_dict[u_id] continue # symptom to disease, decorated by disease p-value if (u_id in symptoms_set and v_id in genetic_diseases_dict) or ( v_id in symptoms_set and u_id in genetic_diseases_dict): try: d["p_value"] = genetic_diseases_dict[v_id] except: d["p_value"] = genetic_diseases_dict[u_id] continue # disease to protein if (u_id in genetic_diseases_dict and v_id in implicated_proteins_dict) or ( v_id in genetic_diseases_dict and u_id in implicated_proteins_dict): try: d["p_value"] = implicated_proteins_dict[v_id] except: d["p_value"] = implicated_proteins_dict[u_id] continue # protein to pathway if (u_id in implicated_proteins_dict and v_id in pathways_selected_dict) or ( v_id in implicated_proteins_dict and u_id in pathways_selected_dict): try: d["p_value"] = pathways_selected_dict[v_id] except: d["p_value"] = pathways_selected_dict[u_id] continue # pathway to protein if 
(u_id in pathways_selected_dict and v_id in pathway_proteins_dict) or ( v_id in pathways_selected_dict and u_id in pathway_proteins_dict): try: d["p_value"] = pathway_proteins_dict[v_id] except: d["p_value"] = pathway_proteins_dict[u_id] continue # protein to drug if (u_id in pathway_proteins_dict and v_id in drugs_selected_dict ) or (v_id in pathway_proteins_dict and u_id in drugs_selected_dict): try: d["p_value"] = drugs_selected_dict[v_id] except: d["p_value"] = drugs_selected_dict[u_id] continue # otherwise, stick a p_value of 1 d["p_value"] = 1 # decorate with COHD data RU.weight_disease_phenotype_by_cohd( g, max_phenotype_oxo_dist=2, default_value=1 ) # automatically pulls it out to top-level property # decorate with drug->target binding probability RU.weight_graph_with_property( g, "probability", default_value=1, transformation=lambda x: x) # pulls it out to top level property # transform the graph properties so they all point the same direction # will be finding shortest paths, so make 0=bad, 1=good transform to 0=good, 1=bad RU.transform_graph_weight( g, "cohd_freq", default_value=0, transformation=lambda x: 1 / float(x + .001) - 1 / (1 + .001)) RU.transform_graph_weight( g, "probability", default_value=0, transformation=lambda x: 1 / float(x + .001) - 1 / (1 + .001)) # merge the graph properties (additively) RU.merge_graph_properties(g, ["p_value", "cohd_freq", "probability"], "merged", operation=lambda x, y: x + y) graph_weight_tuples = [] for drug in drugs_selected: decorated_paths, decorated_path_edges, path_lengths = RU.get_top_shortest_paths( g, disease_id, drug, num_paths, property='merged') for path_ind in range(num_paths): g2 = nx.Graph() path = decorated_paths[path_ind] for node_prop in path: node_uuid = node_prop['properties']['UUID'] g2.add_node(node_uuid, **node_prop) path = decorated_path_edges[path_ind] for edge_prop in path: source_uuid = edge_prop['properties']['source_node_uuid'] target_uuid = edge_prop['properties']['target_node_uuid'] g2.add_edge(source_uuid, target_uuid, **edge_prop) graph_weight_tuples.append((g2, path_lengths[path_ind], drug)) # sort by the path weight graph_weight_tuples.sort(key=lambda x: x[1]) # print out the results if not use_json: for graph, weight, drug_id in graph_weight_tuples: drug_description = RU.get_node_property( drug_id, "name", node_label="chemical_substance") print("%s %f" % (drug_description, weight)) else: response.response.table_column_names = [ "disease name", "disease ID", "drug name", "drug ID", "path weight", "drug disease google distance", "ML probability drug treats disease" ] for graph, weight, drug_id in graph_weight_tuples: drug_description = RU.get_node_property( drug_id, "name", node_label="chemical_substance") drug_id_old_curie = drug_id.replace("CHEMBL.COMPOUND:CHEMBL", "ChEMBL:") # Machine learning probability of "treats" prob = p.prob_single(drug_id_old_curie, disease_id) if not prob: prob = -1 else: prob = prob[0] confidence = prob # Google distance gd = NormGoogleDistance.get_ngd_for_all( [drug_id, disease_id], [drug_description, disease_description]) # populate the graph res = response.add_subgraph( graph.nodes(data=True), graph.edges(data=True), "The drug %s is predicted to treat %s." 
% (drug_description, disease_description), confidence, return_result=True) res.essence = "%s" % drug_description # populate with essence of question result row_data = [] # initialize the row data row_data.append("%s" % disease_description) row_data.append("%s" % disease_id) row_data.append("%s" % drug_description) row_data.append("%s" % drug_id) row_data.append("%f" % weight) row_data.append("%f" % gd) row_data.append("%f" % prob) res.row_data = row_data response.print()
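# --- Illustrative sketch (toy graph; not RU.get_top_shortest_paths) ---
# The version above merges p-values, COHD frequencies, and binding probabilities into a single
# "merged" edge weight and then keeps the lightest disease-to-drug paths. The path-ranking part can
# be expressed with networkx's shortest_simple_paths, which yields simple paths lightest-first:
import itertools
import networkx as nx

def top_k_weighted_paths(g, source, target, k, weight="merged"):
    """Return up to k simple source->target paths, ordered by total edge `weight`."""
    try:
        return list(itertools.islice(nx.shortest_simple_paths(g, source, target, weight=weight), k))
    except nx.NetworkXNoPath:
        return []

# toy = nx.Graph()
# toy.add_edge("disease", "proteinA", merged=0.1); toy.add_edge("proteinA", "drug", merged=0.1)
# toy.add_edge("disease", "proteinB", merged=1.0); toy.add_edge("proteinB", "drug", merged=1.0)
# top_k_weighted_paths(toy, "disease", "drug", 2) -> [['disease', 'proteinA', 'drug'], ['disease', 'proteinB', 'drug']]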
def answer(source_node_ID, target_node_type, association_node_type, use_json=False, threshold=0.2, n=20):
    """
    Answers the question 'what X are similar to Y?' based on overlap of common Z nodes.
    X is target_node_type, Y is source_node_ID, Z is association_node_type.
    The relationships are automatically determined in SimilarNodesInCommon by looking for 1-hop
    relationships and popping the FIRST one (you are warned).
    :param source_node_ID: actual name in the KG
    :param target_node_type: kinds of nodes you want returned
    :param association_node_type: kind of node you are computing the Jaccard overlap on
    :param use_json: print the results in standardized format
    :param threshold: only return results where jaccard is >= this threshold
    :param n: number of results to return (default 20)
    :return: response (or printed text)
    """
    # Initialize the response class
    response = FormatOutput.FormatResponse(5)
    # Initialize the similar nodes class
    similar_nodes_in_common = SimilarNodesInCommon.SimilarNodesInCommon()

    # get the description
    source_node_description = RU.get_node_property(source_node_ID, 'description')

    # get the source node label
    source_node_label = RU.get_node_property(source_node_ID, 'label')

    # Get the nodes in common
    node_jaccard_tuples_sorted, error_code, error_message = similar_nodes_in_common.get_similar_nodes_in_common_source_target_association(
        source_node_ID, target_node_type, association_node_type, threshold)

    # reduce to the top n
    if len(node_jaccard_tuples_sorted) > n:
        node_jaccard_tuples_sorted = node_jaccard_tuples_sorted[0:n]

    # make sure that the input node isn't in the list
    node_jaccard_tuples_sorted = [i for i in node_jaccard_tuples_sorted if i[0] != source_node_ID]

    # check for an error
    if error_code is not None or error_message is not None:
        if not use_json:
            print(error_message)
            return
        else:
            response.add_error_message(error_code, error_message)
            response.print()
            return

    # Otherwise return the results
    if not use_json:
        to_print = "The %s's involving similar %s's as %s are: \n" % (
            target_node_type, association_node_type, source_node_description)
        for other_disease_ID, jaccard in node_jaccard_tuples_sorted:
            to_print += "%s\t%s\tJaccard %f\n" % (
                other_disease_ID, RU.get_node_property(other_disease_ID, 'description'), jaccard)
        print(to_print)
    else:
        node_jaccard_ID_sorted = [id for id, jac in node_jaccard_tuples_sorted]

        # print(RU.return_subgraph_through_node_labels(source_node_ID, source_node_label, node_jaccard_ID_sorted, target_node_type,
        #                                              [association_node_type], with_rel=[], directed=True, debug=True))

        # get the entire subgraph
        g = RU.return_subgraph_through_node_labels(source_node_ID, source_node_label, node_jaccard_ID_sorted,
                                                   target_node_type, [association_node_type],
                                                   with_rel=[], directed=False, debug=False)

        # extract the source_node_number
        for node, data in g.nodes(data=True):
            if data['properties']['name'] == source_node_ID:
                source_node_number = node
                break

        # Get all the target numbers
        target_id2numbers = dict()
        node_jaccard_ID_sorted_set = set(node_jaccard_ID_sorted)
        for node, data in g.nodes(data=True):
            if data['properties']['name'] in node_jaccard_ID_sorted_set:
                target_id2numbers[data['properties']['name']] = node

        for other_disease_ID, jaccard in node_jaccard_tuples_sorted:
            to_print = "The %s %s involves similar %s's as %s with similarity value %f" % (
                target_node_type, RU.get_node_property(other_disease_ID, 'description'),
                association_node_type, source_node_description, jaccard)

            # get all the shortest paths between source and target
            all_paths = nx.all_shortest_paths(g, source_node_number, target_id2numbers[other_disease_ID])

            # get all the nodes on these paths (skip this candidate if no path exists)
            try:
                rel_nodes = set()
                for path in all_paths:
                    for node in path:
                        rel_nodes.add(node)

                if rel_nodes:
                    # extract the relevant subgraph
                    sub_g = nx.subgraph(g, rel_nodes)

                    # add it to the response
                    response.add_subgraph(sub_g.nodes(data=True), sub_g.edges(data=True), to_print, jaccard)
            except:
                pass
        response.print()
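# --- Illustrative sketch (not SimilarNodesInCommon) ---
# The similarity used above is a Jaccard index over shared association nodes. A minimal
# stand-alone version, given the neighbor set of the source and of each candidate
# (the OMIM/HP identifiers in the usage comment are placeholders):
def jaccard_rank(source_neighbors, candidate_to_neighbors, threshold=0.2):
    """Return (candidate, jaccard) tuples with jaccard >= threshold, highest similarity first."""
    results = []
    for candidate, neighbors in candidate_to_neighbors.items():
        union = source_neighbors | neighbors
        jaccard = len(source_neighbors & neighbors) / float(len(union)) if union else 0.0
        if jaccard >= threshold:
            results.append((candidate, jaccard))
    results.sort(key=lambda x: x[1], reverse=True)
    return results

# e.g. jaccard_rank({"HP:0000001", "HP:0000002", "HP:0000003"},
#                   {"OMIM:1": {"HP:0000001", "HP:0000002"}, "OMIM:2": {"HP:0000009"}})
# -> [('OMIM:1', 0.666...)]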
def answer(disease_id, use_json=False, num_show=20): num_diseases_to_select = 10 # number of diseases with shared phenotypes to keep num_omim_keep = 10 # number of genetic conditions to keep num_proteins_keep = 10 # number of proteins implicated in diseases to keep num_pathways_keep = 10 # number of relevant pathways to keep num_proteins_in_pathways_keep = 10 # number of proteins in those pathways to keep num_drugs_keep = 10 # number of drugs that target those proteins to keep # The kinds of paths we're looking for path_type = [ "gene_mutations_contribute_to", "protein", "participates_in", "pathway", "participates_in", "protein", "physically_interacts_with", "chemical_substance" ] # Initialize the response class response = FormatOutput.FormatResponse(6) # get the description of the disease disease_description = RU.get_node_property(disease_id, 'name') # What are the defining symptoms of the disease? # get diseases that have many raw symptoms in common # select top N of them # get subraph of these with the input disease # weight by COHD data # pick diseases with maximal (since frequency) average distance i.e. maximal expected graph distance # get disease that have many raw symptoms in common similar_nodes_in_common = SimilarNodesInCommon.SimilarNodesInCommon() node_jaccard_tuples_sorted, error_code, error_message = similar_nodes_in_common.get_similar_nodes_in_common_source_target_association( disease_id, "disease", "phenotypic_feature", 0) # select the omims diseases_selected = [] for n, j in node_jaccard_tuples_sorted: if n.split(":")[0] == "OMIM": diseases_selected.append(n) # if we found no genetic conditions, add error message and quit if not diseases_selected: response.add_error_message( "NoGeneticConditions", "There appears to be no genetic conditions with phenotypes in common with %s" % disease_description) response.print() return # subset to top N omims that actually have the relationship types that we want: num_selected = 0 diseases_selected_on_desired_path = [] for selected_disease in diseases_selected: if RU.paths_of_type_source_fixed_target_free_exists( selected_disease, "disease", path_type, limit=1): diseases_selected_on_desired_path.append(selected_disease) num_selected += 1 if num_selected >= num_omim_keep: break diseases_selected = diseases_selected_on_desired_path # Find most representative symptoms by consulting COHD. 
TODO: see if this actually helps anything # get subgraph of these with the input disease # get all symptoms of input disease # all_symptoms = set() # for selected_disease in diseases_selected: # intermediate_phenotypes = RU.get_intermediate_node_ids(disease_id, "disease", "has_phenotype", "phenotypic_feature", "has_phenotype", selected_disease, "disease") # all_symptoms.update(intermediate_phenotypes) # turn it back into a list # all_symptoms = list(all_symptoms) # get the subgraph of all relevant symptoms, the omims selected, and the input disease # g = RU.get_graph_from_nodes(all_symptoms + diseases_selected + [disease_id], edges=True) # weight by COHD data (if you want to) # RU.weight_disease_phenotype_by_cohd(g, max_phenotype_oxo_dist=2) # sort by COHD freq # disease_path_weight_sorted = RU.get_sorted_path_weights_disease_to_disease(g, disease_id) # genetic_diseases_selected = [] # num_omim = 0 # for id, weight in disease_path_weight_sorted: # if id.split(":")[0] == "OMIM": # genetic_diseases_selected.append(id) # num_omim += 1 # if num_omim >= num_omim_keep: # break # in the mean-time, use them all genetic_diseases_selected = diseases_selected # select representative diseases # Do nothing for now (use all of them) # get drugs that are connected along the paths we want and count how many such paths there are genetic_diseases_to_chemical_substance_dict = dict() for selected_disease in genetic_diseases_selected: res = RU.count_paths_of_type_source_fixed_target_free( selected_disease, "disease", path_type, limit=num_drugs_keep) # add it to our dictionary genetic_diseases_to_chemical_substance_dict[selected_disease] = res # get the unique drugs drug_counts_tuples = [ item for items in genetic_diseases_to_chemical_substance_dict.values() for item in items ] drugs_path_counts = dict() for drug, count in drug_counts_tuples: if drug not in drugs_path_counts: drugs_path_counts[drug] = count else: drugs_path_counts[drug] += count # put them as tuples in a list, sorted by the ones with the most paths drugs_path_counts_tuples = [] for drug in drugs_path_counts.keys(): count = drugs_path_counts[drug] drugs_path_counts_tuples.append((drug, count)) drugs_path_counts_tuples.sort(key=lambda x: x[1], reverse=True) if not use_json: #for drug, count in drugs_path_counts_tuples: # name = RU.get_node_property(drug, "name", node_label="chemical_substance") # print("%s (%s): %d" % (name, drug, count)) print("source,target") for drug, count in drugs_path_counts_tuples: drug_old_curie = drug.split(":")[1].replace("L", "L:").replace( "H", "h") print("%s,%s" % (drug_old_curie, disease_id))
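# --- Illustrative sketch ---
# The tail of the function above sums path counts per drug over all selected genetic conditions and
# sorts drugs by that total. That aggregation is just a Counter; the ids and counts below are made up:
from collections import Counter

def aggregate_drug_path_counts(disease_to_drug_counts):
    """Sum path counts per drug across diseases; return (drug, count) tuples, most paths first."""
    totals = Counter()
    for drug_counts in disease_to_drug_counts.values():
        for drug, count in drug_counts:
            totals[drug] += count
    return totals.most_common()

# e.g. aggregate_drug_path_counts({"OMIM:100100": [("CHEMBL:25", 4), ("CHEMBL:112", 1)],
#                                  "OMIM:203750": [("CHEMBL:25", 2)]})
# -> [('CHEMBL:25', 6), ('CHEMBL:112', 1)]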
def answer(drug_id, use_json=False, num_show=20, rev=True, conservative=True): """ Answers the question 'what diseases does $drug commonly treat?' :param disease_id: KG disease node name :param use_json: bool, use JSON output :param num_show: int, number to display :param rev: bool. order by most frequent :param conservative: bool, True if using exact matches, False if using any synonyms returned by COHD :return: none """ # Initialize the response class response = FormatOutput.FormatResponse(6) # get the description drug_description = RU.get_node_property(drug_id, 'name', name_type='id') # Get the conditions that COHD says it's used to treat conditions_treated = COHDUtilities.get_conditions_treating( drug_description, conservative=conservative) # sort the diseases by frequency ids_counts = [] for id in conditions_treated: cond = conditions_treated[id] ids_counts.append((id, cond['concept_count'])) ids_counts_sorted = sorted(ids_counts, key=lambda x: x[1], reverse=rev) ids_sorted = [i[0] for i in ids_counts_sorted] # reduce to top n ids_sorted_top_n = ids_sorted if len(ids_sorted_top_n) > num_show: ids_sorted_top_n = ids_sorted_top_n[0:num_show] # return the results if not use_json: if rev: to_print = "The most common conditions " else: to_print = "The least common conditions " to_print += "treated with %s, according to the Columbia Open Health Data, are:\n" % drug_description for id in ids_sorted_top_n: to_print += "condition: %s\t count %d \t frequency %f \n" % ( conditions_treated[id]['associated_concept_name'], conditions_treated[id]['concept_count'], conditions_treated[id]['concept_frequency']) print(to_print) else: # otherwise, you want a JSON output # Attempt to map the COHD names to the KG (this takes some time)l. TODO: find further speed improvements drug_as_graph = RU.get_node_as_graph(drug_id) drug_node_info = list(drug_as_graph.nodes(data=True))[0][1] id_to_KG_name = dict() id_to_name = dict() id_to_count = dict() id_to_frequency = dict() id_to_id = dict() # Map ID's to all relevant values for id in ids_sorted_top_n: id_to_name[id] = conditions_treated[id][ 'associated_concept_name'] id_to_count[id] = conditions_treated[id]['concept_count'] id_to_frequency[id] = conditions_treated[id][ 'concept_frequency'] id_to_KG_name[id] = None try: id_to_KG_name[id] = RU.get_id_from_property( id_to_name[id], 'name', label="phenotypic_feature") id_to_id[id_to_KG_name[id]] = id except: try: id_to_KG_name[id] = RU.get_id_from_property( id_to_name[id], 'name', label="disease") id_to_id[id_to_KG_name[id]] = id except: try: id_to_KG_name[id] = RU.get_id_from_property( id_to_name[id].lower(), 'name', label="phenotypic_feature") id_to_id[id_to_KG_name[id]] = id except: try: id_to_KG_name[id] = RU.get_id_from_property( id_to_name[id].lower(), 'name', label="disease") id_to_id[id_to_KG_name[id]] = id except: pass # get the graph (one call) of all the nodes that wer mapped KG_names = [] for id in ids_sorted_top_n: if id_to_KG_name[id] is not None: KG_names.append(id_to_KG_name[id]) if not KG_names: error_message = "Sorry, Columbia Open Health Data has no data on the use of %s" % drug_description error_code = "EmptyResult" response.add_error_message(error_code, error_message) response.print() return 1 all_conditions_graph = RU.get_graph_from_nodes(KG_names) # Get the info of the mapped nodes id_to_info = dict() for u, data in all_conditions_graph.nodes(data=True): id = data['properties']['id'] id = id_to_id[id] id_to_info[id] = data # for each condition, return the results (with the nice sub-graph if 
the cohd id's were mapped) for id in ids_sorted_top_n: if id_to_KG_name[id] is not None: to_print = "According to the Columbia Open Health Data, %s is used to treat patients with the condition %s with frequency " \ "%f out of all patients treated with %s (count=%d)." % ( drug_description, id_to_name[id], id_to_frequency[id], drug_description, id_to_count[id]) nodes = [] disease_node_info = id_to_info[id] nodes.append((2, disease_node_info)) nodes.append((1, drug_node_info)) edges = [(1, 2, { 'id': 3, 'properties': { 'is_defined_by': 'RTX', 'predicate': 'treats', 'provided_by': 'COHD', 'relation': 'treats', 'seed_node_uuid': '-1', 'source_node_uuid': drug_node_info['properties']['UUID'], 'target_node_uuid': disease_node_info['properties']['UUID'] }, 'type': 'treats' })] response.add_subgraph(nodes, edges, to_print, id_to_frequency[id]) else: to_print = "According to the Columbia Open Health Data, %s is used to treat patients with the condition %s with frequency " \ "%f out of all patients treated with %s (count=%d). This condition is not in our " \ "Knowledge graph, so no graph is shown." % ( drug_description, id_to_name[id], id_to_frequency[id], drug_description, id_to_count[id]) g = RU.get_node_as_graph(drug_id) response.add_subgraph(g.nodes(data=True), g.edges(data=True), to_print, id_to_frequency[id]) response.print()
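# --- Illustrative sketch (the payload shape is inferred from the code above) ---
# Selecting the most (or least) common conditions from the COHD response is a sort by
# 'concept_count' followed by a slice; heapq.nlargest/nsmallest expresses the same top-n step:
import heapq

def top_conditions(conditions_treated, num_show=20, rev=True):
    """Return up to num_show condition ids ordered by concept_count (most common first when rev=True)."""
    key = lambda cid: conditions_treated[cid]['concept_count']
    picker = heapq.nlargest if rev else heapq.nsmallest
    return picker(num_show, conditions_treated, key=key)

# e.g. top_conditions({"C001": {"concept_count": 120}, "C002": {"concept_count": 15}}, num_show=1)
# -> ['C001']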
def answer(tissue_id, input_protein_list, use_json=False, num_show=20, rev=True): # Initialize the response class response = FormatOutput.FormatResponse(6) # Make sure everything exists in the graph if not RU.node_exists_with_property(tissue_id, "id"): tissue_id = RU.get_node_property(tissue_id, "id", node_label="anatomical_entity") for i in range(len(input_protein_list)): id = input_protein_list[i] if not RU.node_exists_with_property(id, "id"): input_protein_list[i] = RU.get_node_property( id, "id", node_label="protein") # Initialize the QueryLilGim class q = QueryLilGIM.QueryLilGIM() # get the description tissue_description = RU.get_node_property( tissue_id, 'name', node_label="anatomical_entity") # Get the correlated proteins try: correlated_proteins_dict = q.query_neighbor_genes_for_gene_set_in_a_given_anatomy( tissue_id, tuple(input_protein_list)) #correlated_proteins_dict = {'UniProtKB:Q99618': 0.4276333333333333, 'UniProtKB:Q92698': 0.464, 'UniProtKB:P56282': 0.5810000000000001, 'UniProtKB:P49454': 0.4441, 'UniProtKB:P49642': 0.5188333333333334, 'UniProtKB:Q9BZD4': 0.5042666666666668, 'UniProtKB:P38398': 0.4464, 'UniProtKB:Q9BXL8': 0.5009, 'UniProtKB:P42166': 0.4263000000000001, 'UniProtKB:Q96CS2': 0.5844333333333332, 'UniProtKB:Q9BQP7': 0.4903333333333333, 'UniProtKB:O95997': 0.4743333333333333, 'UniProtKB:Q9H4K1': 0.4709, 'UniProtKB:Q9H967': 0.5646666666666667, 'UniProtKB:Q12834': 0.4478, 'UniProtKB:Q71F23': 0.4361, 'UniProtKB:Q9UQ84': 0.4800666666666666, 'UniProtKB:Q9NSP4': 0.4347} except: error_message = "Lil'GIM is experiencing a problem." error_code = "LilGIMerror" response.add_error_message(error_code, error_message) response.print() return 1 # as a list of tuples correlated_proteins_tupes = [] for k, v in correlated_proteins_dict.items(): correlated_proteins_tupes.append((k, v)) # sort by freq correlated_proteins_tupes_sorted = sorted(correlated_proteins_tupes, key=lambda x: x[1], reverse=rev) correlated_proteins_tupes_sorted = correlated_proteins_tupes_sorted[ 0:num_show] correlated_proteins_tupes = correlated_proteins_tupes_sorted # return the results if not use_json: try: protein_descriptions = RU.get_node_property( input_protein_list[0], "name", node_label="protein", name_type="id") except: protein_descriptions = input_protein_list[0] for id in input_protein_list[1:-1]: protein_descriptions += ", " try: protein_descriptions += RU.get_node_property( id, "name", node_label="protein", name_type="id") except: protein_descriptions += id if len(input_protein_list) > 1: try: protein_descriptions += ", and %s" % RU.get_node_property( input_protein_list[-1], "name", node_label="protein", name_type="id") except: protein_descriptions += ", and %s" % input_protein_list[-1] if rev: to_print = "In the tissue: %s, the proteins that correlate most with %s" % ( tissue_description, protein_descriptions) else: to_print = "In the tissue: %s, the proteins that correlate least with %s" % ( tissue_description, protein_descriptions) to_print += " according to Lil'GIM, are:\n" for id, val in correlated_proteins_tupes_sorted: try: to_print += "protein: %s\t correlation %f\n" % ( RU.get_node_property( id, "name", node_label="protein", name_type="id"), val) except: to_print += "protein: %s\t correlation %f\n" % (id, val) print(to_print) else: # otherwise, you want a JSON output protein_descriptions = [] is_in_KG_list = [] for protein, corr in correlated_proteins_tupes: try: description = RU.get_node_property(protein, "name", node_label="protein", name_type="id") 
protein_descriptions.append(description) is_in_KG_list.append(True) except: protein_description = protein protein_descriptions.append(protein_description) is_in_KG_list.append(False) # just get the ones that are actually in the KG. TODO: do something with the ones that are not in the KG correlated_proteins_tupes_in_KG = [] for i in range(len(correlated_proteins_tupes)): if is_in_KG_list[i]: correlated_proteins_tupes_in_KG.append( correlated_proteins_tupes[i]) # Return the results full_g = RU.get_graph_from_nodes( [id for id, val in correlated_proteins_tupes_in_KG], node_property_label="id") id2node = dict() for nx_id, node in full_g.nodes(data=True): id2node[node['properties']['id']] = node for id, corr in correlated_proteins_tupes_in_KG: to_print = "In the tissue: %s, the protein %s has correlation %f with the given list of proteins." % ( tissue_description, RU.get_node_property( id, "name", node_label="protein", name_type="id"), corr) response.add_subgraph([(id, id2node[id])], [], to_print, corr) response.print()
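# --- Illustrative sketch ---
# The plain-text branch above builds a readable "A, B, and C" string of protein descriptions by
# hand; isolating that into a small helper keeps the message-building code simpler. The protein
# symbols in the usage comment are placeholders:
def readable_list(descriptions):
    """Join descriptions as 'A', 'A and B', or 'A, B, and C'."""
    if not descriptions:
        return ""
    if len(descriptions) == 1:
        return descriptions[0]
    if len(descriptions) == 2:
        return "%s and %s" % (descriptions[0], descriptions[1])
    return "%s, and %s" % (", ".join(descriptions[:-1]), descriptions[-1])

# e.g. readable_list(["BRCA1", "CENPF", "GINS1"]) -> 'BRCA1, CENPF, and GINS1'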