def euclidean_distance(
        self,
        profile_a: Iterable[str],
        profile_b: Iterable[str],
        predicate: Optional[URIRef] = RDFS['subClassOf']) -> float:
    """
    Groupwise euclidean distance between two phenotype profiles

    Both profiles are expanded with owl_utils.get_profile_closure and
    then represented as vectors over the union of the two closures: a
    term contributes its information content (self.ic_map) when it is
    in that profile's closure and 0 otherwise.  The distance is the
    norm of the difference between the two vectors.

    This is roughly analogous to, but not the inverse of, simGIC

    Phenotypes prefixed with '-' (negated) are dropped before the
    closures are computed.

    :param profile_a: phenotype ids of the first profile
    :param profile_b: phenotype ids of the second profile
    :param predicate: predicate passed through to the closure lookup
    :return: euclidean distance between the two IC vectors
    """
    def drop_negated(profile):
        # Negated phenotypes carry a leading '-'
        return {term for term in profile if not term.startswith("-")}

    positives_a = drop_negated(profile_a)
    positives_b = drop_negated(profile_b)

    a_closure = owl_utils.get_profile_closure(
        positives_a, self.graph, self.root, predicate)
    b_closure = owl_utils.get_profile_closure(
        positives_b, self.graph, self.root, predicate)

    # Fix one ordering of the union so both vectors share the same axes
    axes = list(a_closure.union(b_closure))

    def as_vector(closure):
        return np.array(
            [self.ic_map[term] if term in closure else 0 for term in axes])

    difference = as_vector(a_closure) - as_vector(b_closure)
    return np.linalg.norm(difference)
def groupwise_jaccard(
        self,
        profiles: Iterable[Iterable[str]],
        predicate: Optional[URIRef] = RDFS['subClassOf']) -> float:
    """
    Groupwise jaccard similarity across any number of profiles

    Every profile is expanded with owl_utils.get_profile_closure; the
    score is the number of terms common to all closures divided by the
    number of terms found in at least one closure.

    Profiles are used exactly as given: phenotypes prefixed with '-'
    (negated) are NOT filtered out here, so callers are expected to
    pass positive phenotypes only.

    :param profiles: iterable of profiles, each an iterable of
                     phenotype ids
    :param predicate: predicate passed through to the closure lookup
    :return: size of the intersection over size of the union
    :raises ZeroDivisionError: if the union of closures is empty, for
                               example when no profiles are supplied
    """
    profile_union = set()
    profile_intersection = set()
    is_first = True

    for profile in profiles:
        # Compute the closure once per profile: it is needed for both
        # the union and the intersection, and a profile that is a
        # one-shot iterable cannot be traversed a second time.
        closure = owl_utils.get_profile_closure(
            profile, self.graph, self.root, predicate)

        profile_union = profile_union.union(closure)

        if is_first:
            # The first closure seeds the running intersection
            profile_intersection = closure
            is_first = False
        else:
            profile_intersection = profile_intersection.intersection(closure)

    return len(profile_intersection) / len(profile_union)
def groupwise_sim_gic(
        self,
        profiles: Iterable[Iterable[str]],
        predicate: Optional[URIRef] = RDFS['subClassOf']) -> float:
    """
    Groupwise simGIC across any number of profiles

    Every profile is expanded with owl_utils.get_profile_closure; the
    score is the summed information content (self.ic_map) of the terms
    common to all closures divided by the summed information content
    of the terms found in at least one closure.

    Profiles are used exactly as given: phenotypes prefixed with '-'
    (negated) are NOT filtered out here, so callers are expected to
    pass positive phenotypes only.

    :param profiles: iterable of profiles, each an iterable of
                     phenotype ids
    :param predicate: predicate passed through to the closure lookup
    :return: summed IC of the intersection over summed IC of the union
    :raises ZeroDivisionError: if the summed IC of the union is zero,
                               for example when no profiles are
                               supplied
    """
    profile_union = set()
    profile_intersection = set()
    is_first = True

    for profile in profiles:
        # Compute the closure once per profile: it is needed for both
        # the union and the intersection, and a profile that is a
        # one-shot iterable cannot be traversed a second time.
        closure = owl_utils.get_profile_closure(
            profile, self.graph, self.root, predicate)

        profile_union = profile_union.union(closure)

        if is_first:
            # The first closure seeds the running intersection
            profile_intersection = closure
            is_first = False
        else:
            profile_intersection = profile_intersection.intersection(closure)

    # sum() copes with an empty intersection (numerator 0), whereas
    # reduce() without an initial value raises TypeError on it.
    numerator = sum(self.ic_map[pheno] for pheno in profile_intersection)
    denominator = sum(self.ic_map[pheno] for pheno in profile_union)

    return numerator / denominator
def sim_gic(self,
            profile_a: Iterable[str],
            profile_b: Iterable[str],
            predicate: Optional[URIRef] = RDFS['subClassOf']) -> float:
    """
    Groupwise resnik similarity (simGIC):
    Summed information content of common ancestors divided by summed
    information content of all ancestors in profile a and profile b
    https://bmcbioinformatics.biomedcentral.com/track/pdf/10.1186/1471-2105-9-S5-S4

    Phenotypes prefixed with '-' (negated) are dropped before the
    closures are computed.

    :param profile_a: phenotype ids of the first profile
    :param profile_b: phenotype ids of the second profile
    :param predicate: predicate passed through to the closure lookup
    :return: summed IC of the shared closure terms over summed IC of
             all closure terms; 0 when the closures share no term
    :raises ZeroDivisionError: if the summed IC of the union of the
                               two closures is zero
    """
    # Filter out negative phenotypes
    profile_a = {pheno for pheno in profile_a if not pheno.startswith("-")}
    profile_b = {pheno for pheno in profile_b if not pheno.startswith("-")}

    a_closure = owl_utils.get_profile_closure(
        profile_a, self.graph, self.root, predicate)
    b_closure = owl_utils.get_profile_closure(
        profile_b, self.graph, self.root, predicate)

    # sum() yields 0 for an empty intersection, whereas reduce()
    # without an initial value raises TypeError on it.
    numerator = sum(
        self.ic_map[pheno] for pheno in a_closure.intersection(b_closure))
    denominator = sum(
        self.ic_map[pheno] for pheno in a_closure.union(b_closure))

    return numerator / denominator
def jaccard_sim(self,
                profile_a: Iterable[str],
                profile_b: Iterable[str],
                predicate: Optional[URIRef] = RDFS['subClassOf']) -> float:
    """
    Groupwise jaccard similarity

    Negative phenotypes must be prefixed with a '-'; they are removed
    from both profiles before the closures are computed.  The two
    closures are then handed to metric.jaccard.

    :param profile_a: phenotype ids of the first profile
    :param profile_b: phenotype ids of the second profile
    :param predicate: predicate passed through to the closure lookup
    :return: the value returned by metric.jaccard for the two closures
    """
    # Keep only the positive phenotypes of each profile
    positive_profiles = [
        {term for term in profile if not term.startswith("-")}
        for profile in (profile_a, profile_b)
    ]

    closure_a, closure_b = [
        owl_utils.get_profile_closure(
            positives, self.graph, self.root, predicate)
        for positives in positive_profiles
    ]

    return metric.jaccard(closure_a, closure_b)
def cosine_sim(self,
               profile_a: Iterable[str],
               profile_b: Iterable[str],
               ic_weighted: Optional[bool] = False,
               negative_weight: Optional[Num] = 1,
               predicate: Optional[URIRef] = RDFS['subClassOf']) -> float:
    """
    Cosine similarity
    Profiles are treated as vectors of numbers between 0-1:
    1: Phenotype present
    0: Absent (no information)
    1 * negative weight: Negated phenotypes

    if ic_weighted is true the attributes become vectors of
    information content scores

    Inferred phenotypes are computed as parent classes for positive
    phenotypes and child classes for negative phenotypes.

    Typically we do not want to weight negative phenotypes as high as
    positive phenotypes.  A weight between .01-.1 may be desirable

    Negated phenotypes must be prefixed with a '-'.

    NOTE(review): negated closure terms are scored with their '-'
    prefix intact, so ic_weighted=True looks them up in self.ic_map
    under the prefixed key - confirm ic_map carries such keys.

    :param profile_a: phenotype ids of the first profile
    :param profile_b: phenotype ids of the second profile
    :param ic_weighted: score terms by self.ic_map instead of 1
    :param negative_weight: multiplier applied to negated terms
    :param predicate: predicate passed through to the closure lookup
    :return: dot product of the two profile vectors divided by the
             product of their magnitudes
    :raises ZeroDivisionError: if either profile yields an all-zero
                               vector, for example an empty profile
    """
    def score(term):
        if ic_weighted:
            attribute = self.ic_map[term]
        else:
            attribute = 1
        return attribute

    def squared_sum(terms, weight=1):
        # Sum of squared (optionally weighted) scores; 0 for no terms
        return sum(math.pow(score(term) * weight, 2) for term in terms)

    def split_profile(profile):
        # Materialise first: the profile is scanned twice (positive and
        # negated terms), which would lose data on a one-shot iterable.
        terms = list(profile)
        positive = {item for item in terms if not item.startswith('-')}
        negative = {item[1:] for item in terms if item.startswith('-')}
        return positive, negative

    def negated_closure(negative_profile):
        # Closure for negated terms is requested with negative=True and
        # the '-' prefix is put back on every resulting term
        return {
            "-{}".format(item)
            for item in owl_utils.get_profile_closure(
                negative_profile, self.graph, self.root, predicate,
                negative=True)
        }

    positive_a_profile, negative_a_profile = split_profile(profile_a)
    positive_b_profile, negative_b_profile = split_profile(profile_b)

    pos_a_closure = owl_utils.get_profile_closure(
        positive_a_profile, self.graph, self.root, predicate)
    pos_b_closure = owl_utils.get_profile_closure(
        positive_b_profile, self.graph, self.root, predicate)

    neg_a_closure = negated_closure(negative_a_profile)
    neg_b_closure = negated_closure(negative_b_profile)

    # Only terms present in both profiles contribute to the dot product
    pos_intersect_dot_product = squared_sum(
        pos_a_closure.intersection(pos_b_closure))
    neg_intersect_dot_product = squared_sum(
        neg_a_closure.intersection(neg_b_closure), negative_weight)

    # Magnitude of each profile vector
    a_square_dot_product = math.sqrt(
        squared_sum(pos_a_closure)
        + squared_sum(neg_a_closure, negative_weight))
    b_square_dot_product = math.sqrt(
        squared_sum(pos_b_closure)
        + squared_sum(neg_b_closure, negative_weight))

    numerator = pos_intersect_dot_product + neg_intersect_dot_product
    denominator = a_square_dot_product * b_square_dot_product

    return numerator / denominator