Exemplo n.º 1
0
    def answer(self, disease_ID, use_json=False, threshold=0.2):
        """
        Answer the question: what other diseases are phenotypically similar to the given disease?

        Similarity is the Jaccard index between the phenotypes of ``disease_ID`` and the
        phenotypes of each other disease that the RU path-counting helpers report as connected
        to it through a ``phenotypic_feature`` node. Diseases whose Jaccard index is strictly
        greater than ``threshold`` are reported, sorted from most to least similar.

        :param disease_ID: KG disease name (eg. DOID:8398)
        :param use_json: use the standardized output format (otherwise plain text on stdout)
        :param threshold: only include diseases with Jaccard index above this
        :return: None (print to stdout), unless there's an error, then return 1
        """
        # Initialize the response class
        response = FormatOutput.FormatResponse(4)

        def report_error(error_code, error_message):
            # Single exit path for every failure, so that the JSON response is always
            # printed after the error is attached to it.
            if use_json:
                response.add_error_message(error_code, error_message)
                response.print()
            else:
                print(error_message)
            return 1

        # Check if node exists
        if not RU.node_exists_with_property(disease_ID, 'name'):
            return report_error(
                "DiseaseNotFound",
                "Sorry, the disease %s is not yet in our knowledge graph." % disease_ID)

        # Get label/kind of node the source is
        disease_label = RU.get_node_property(disease_ID, "label")
        if disease_label != "disease":
            # Adjacent literals are joined before '%' is applied, so the whole
            # message is formatted at once.
            return report_error(
                "NotADisease",
                "Sorry, the input has label %s and needs to be one of: disease, disease."
                " Please try a different term" % disease_label)

        # get the description
        disease_description = RU.get_node_property(disease_ID, 'description')

        def report_no_similar_diseases():
            # Nothing crossed the threshold: say so, and suggest the parent disease
            # (if the graph has one) as an alternative query.
            error_message = "No diseases were found with similarity crossing the threshold of %f." % threshold
            parent = RU.get_one_hop_target(disease_label,
                                           disease_ID,
                                           disease_label,
                                           "subclass_of",
                                           direction="r")
            if parent:
                parent = parent.pop()
                error_message += "\n Note that %s is a parent disease to %s, so you might try that instead." % (
                    RU.get_node_property(parent, 'description'), disease_description)
            return report_error("NoDiseasesFound", error_message)

        # get the phenotypes associated to the disease
        disease_phenotypes = RU.get_one_hop_target(disease_label, disease_ID,
                                                   "phenotypic_feature",
                                                   "has_phenotype")

        # Look more steps beyond if we didn't get any directly attached phenotypes
        if not disease_phenotypes:
            for max_path_len in range(2, 5):
                disease_phenotypes = RU.get_node_names_of_type_connected_to_target(
                    disease_label,
                    disease_ID,
                    "phenotypic_feature",
                    max_path_len=max_path_len,
                    direction="u")
                if disease_phenotypes:
                    break

        # Make sure you actually picked up at least one phenotype
        if not disease_phenotypes:
            return report_error("NoPhenotypesFound", "No phenotypes found for this disease.")

        # Size of the de-duplicated phenotype set. It is used both for the pre-filter
        # and for the union cardinality so that repeated entries in the returned
        # list cannot inflate the denominator of the Jaccard index.
        num_phenotypes = len(set(disease_phenotypes))

        # Path shape shared by both counting queries:
        # source -has_phenotype- phenotypic_feature -has_phenotype- disease
        node_label_list = ["phenotypic_feature"]
        relationship_label_list = ["has_phenotype", "has_phenotype"]
        node_of_interest_position = 0

        # get all the other diseases that connect and the number of phenotypes in common
        names2counts, _ = RU.count_nodes_of_type_on_path_of_type_to_label(
            disease_ID, disease_label, "disease", node_label_list,
            relationship_label_list, node_of_interest_position)
        # If the shared fraction is below the threshold, there is no way the Jaccard
        # index will be large enough, so drop those candidates early.
        other_disease_IDs_to_intersection_counts = {
            other_ID: shared
            for other_ID, shared in names2counts.items()
            if shared / float(num_phenotypes) >= threshold
        }

        # check if any other diseases passed the pre-filter
        if not other_disease_IDs_to_intersection_counts:
            return report_no_similar_diseases()

        # Now for each of the diseases connecting to source, count number of phenotypes
        other_disease_counts = RU.count_nodes_of_type_for_nodes_that_connect_to_label(
            disease_ID, disease_label, "disease", node_label_list,
            relationship_label_list, node_of_interest_position)

        # then compute the jaccard index
        disease_jaccard_tuples = []
        for other_disease_ID, other_count in other_disease_counts.items():
            jaccard = 0
            if other_disease_ID in other_disease_IDs_to_intersection_counts:
                shared = other_disease_IDs_to_intersection_counts[other_disease_ID]
                # |A union B| = |A| + |B| - |A intersect B|
                union_card = num_phenotypes + other_count - shared
                jaccard = shared / float(union_card)
            if jaccard > threshold:
                disease_jaccard_tuples.append((other_disease_ID, jaccard))

        # Maybe nothing passed the threshold
        if not disease_jaccard_tuples:
            return report_no_similar_diseases()

        # Otherwise there are results to return, first sort them largest to smallest
        disease_jaccard_tuples_sorted = sorted(
            disease_jaccard_tuples, key=lambda pair: pair[1], reverse=True)

        if not use_json:
            lines = ["The diseases with phenotypes similar to %s are: \n" % disease_description]
            for other_disease_ID, jaccard in disease_jaccard_tuples_sorted:
                lines.append("%s\t%s\tJaccard %f\n" % (
                    other_disease_ID,
                    RU.get_node_property(other_disease_ID, 'description'),
                    jaccard))
            print("".join(lines))
        else:
            # One sub-graph per similar disease, annotated with a sentence and its score
            for other_disease_ID, jaccard in disease_jaccard_tuples_sorted:
                to_print = "%s is phenotypically similar to the disease %s with similarity value %f" % (
                    disease_description,
                    RU.get_node_property(other_disease_ID, 'description'),
                    jaccard)
                g = RU.get_node_as_graph(other_disease_ID)
                response.add_subgraph(g.nodes(data=True), g.edges(data=True),
                                      to_print, jaccard)
            response.print()
Exemplo n.º 2
0
	def get_similar_nodes_in_common(input_node_ID, input_node_label, association_node_label, input_association_relationship,
				target_association_relationship, target_node_label, threshold=0.2):
		"""
		This function returns the nodes that are associated with an input node based on Jaccard index similarity of
		shared intermediate (association) nodes. Only targets whose Jaccard index is strictly greater than
		``threshold`` are returned, sorted from most to least similar.
		:param input_node_ID: input node ID (in KG)
		:param input_node_label: label of the input node
		:param association_node_label: what kind of node you want to calculate the Jaccard index with
		:param input_association_relationship: how the input node is connected to the association nodes
		:param target_association_relationship: how the target node is connected to the association node
		:param target_node_label: what kind of target nodes to return
		:param threshold: only targets with a Jaccard index above this are returned
		:return: a list of tuples, an error_code, and an error_message. tuple[0] is a target node with tuple[1] jaccard
		index based on association nodes. On success both error values are None; on failure the list is empty.
		"""
		# NOTE(review): there is no self/cls parameter; presumably this is a @staticmethod whose
		# decorator lies outside the visible excerpt -- confirm.

		# get the description
		input_node_description = RU.get_node_property(input_node_ID, 'name')

		def no_match(error_message):
			# Build the "nothing found" result, suggesting the parent node (if the graph has one)
			# as an alternative query. The parent is described by its 'name' property, the same
			# property used for the input node above.
			parent = RU.get_one_hop_target(input_node_label, input_node_ID, input_node_label, "subclass_of", direction="r")
			if parent:
				parent = parent.pop()
				error_message += "\n Note that %s is a parent of %s, so you might try that instead." % (
					RU.get_node_property(parent, 'name'), input_node_description)
			return [], "NoNodesFound", error_message

		# get the nodes associated to the input node
		input_node_associated_nodes = RU.get_one_hop_target(input_node_label, input_node_ID, association_node_label,
															input_association_relationship)

		# Look more steps beyond if we didn't get any directly associated nodes
		if not input_node_associated_nodes:
			for max_path_len in range(2, 5):
				input_node_associated_nodes = RU.get_node_names_of_type_connected_to_target(input_node_label, input_node_ID,
																			association_node_label,
																			max_path_len=max_path_len,
																			direction="u")
				if input_node_associated_nodes:
					break

		# Make sure you actually picked up at least one associated node
		if not input_node_associated_nodes:
			error_code = "NoNodesFound"
			error_message = "No %s found for %s." % (association_node_label, input_node_description)
			return [], error_code, error_message

		# Size of the de-duplicated association set. It is used both for the pre-filter and for the
		# union cardinality so that repeated entries in the returned list cannot inflate the
		# denominator of the Jaccard index.
		num_associated = len(set(input_node_associated_nodes))

		# Path shape shared by both counting queries:
		# input -input_association_relationship- association node -target_association_relationship- target
		node_label_list = [association_node_label]
		relationship_label_list = [input_association_relationship, target_association_relationship]
		node_of_interest_position = 0

		# get all the other nodes that connect and the number of association nodes in common
		names2counts, _ = RU.count_nodes_of_type_on_path_of_type_to_label(input_node_ID, input_node_label,
																		  target_node_label, node_label_list,
																		  relationship_label_list,
																		  node_of_interest_position)
		# If the shared fraction is below the threshold, there is no way the Jaccard index will be
		# large enough, so drop those candidates early.
		other_node_IDs_to_intersection_counts = {
			other_ID: shared
			for other_ID, shared in names2counts.items()
			if shared / float(num_associated) >= threshold
		}

		# check if any other associated nodes passed the pre-filter
		if not other_node_IDs_to_intersection_counts:
			return no_match(
				"No %s were found with similarity crossing the threshold of %f." % (target_node_label, threshold))

		# Now for each of the nodes connecting to source, count number of association nodes
		other_node_counts = RU.count_nodes_of_type_for_nodes_that_connect_to_label(input_node_ID, input_node_label,
																				   target_node_label, node_label_list,
																				   relationship_label_list,
																				   node_of_interest_position)

		# then compute the jaccard index
		node_jaccard_tuples = []
		for other_node_ID, other_count in other_node_counts.items():
			jaccard = 0
			if other_node_ID in other_node_IDs_to_intersection_counts:
				shared = other_node_IDs_to_intersection_counts[other_node_ID]
				# |A union B| = |A| + |B| - |A intersect B|
				union_card = num_associated + other_count - shared
				jaccard = shared / float(union_card)
			if jaccard > threshold:
				node_jaccard_tuples.append((other_node_ID, jaccard))

		# Maybe nothing passed the threshold
		if not node_jaccard_tuples:
			return no_match(
				"No %s's were found with similarity crossing the threshold of %f." % (target_node_label, threshold))

		# Otherwise there are results to return, first sort them largest to smallest
		node_jaccard_tuples_sorted = sorted(node_jaccard_tuples, key=lambda pair: pair[1], reverse=True)

		return node_jaccard_tuples_sorted, None, None