def get_mica_ic(pheno_a: str,
                pheno_b: str,
                graph: Graph,
                ic_map: Dict[str, float],
                root) -> float:
    """
    Return the highest information content shared by two terms

    The subClassOf closure of each term is fetched with
    owl_utils.get_closure, the two closures are intersected, and the
    largest ic_map value among the shared terms is returned (the function
    name suggests this is the IC of the most informative common ancestor).

    :param pheno_a: first term id
    :param pheno_b: second term id
    :param graph: graph handed to owl_utils.get_closure
    :param ic_map: information content keyed by term id
    :param root: root term forwarded to owl_utils.get_closure
    :return: float - largest ic_map value among the shared closure terms
    :raises ValueError: if the two closures have no term in common
    :raises KeyError: if a shared term is missing from ic_map
    """
    subclass_of = RDFS['subClassOf']
    ancestors_a = owl_utils.get_closure(graph, pheno_a, subclass_of, root)
    ancestors_b = owl_utils.get_closure(graph, pheno_b, subclass_of, root)
    shared_terms = ancestors_a.intersection(ancestors_b)
    return max(ic_map[term] for term in shared_terms)
def load_map_from_file(file_path: str) -> Dict[str, Set[str]]:
    """
    Build a disease -> phenotype-set map from a tab separated file

    Only lines that start with 'MONDO' are used; the first two tab
    separated fields are read as disease and phenotype.  Each phenotype is
    stored together with every term get_closure returns for it against the
    module level ``hpo`` graph (root 'HP:0000118', reflexive=False).

    :param file_path: path of the annotation file
    :return: Dict[str, Set[str]] - phenotype ids keyed by disease id
    """
    profile_map: Dict[str, Set[str]] = {}
    with open(file_path, 'r') as annotations:
        for record in annotations:
            # A line starting with '#' can never start with 'MONDO',
            # so this single test also drops comment lines
            if not record.startswith('MONDO'):
                continue
            disease, phenotype = record.rstrip("\n").split("\t")[0:2]
            terms = profile_map.setdefault(disease, set())
            terms.add(phenotype)
            terms.update(
                get_closure(hpo, phenotype, root='HP:0000118', reflexive=False))
    return profile_map
def simulate_from_derived(
        pheno_profile: Set[str],
        pheno_subset: Set[str],
        graph: Graph,
        root: str,
        ic_values: Dict[str, float],
        filter_out: Set[str],
        ref_disease: Optional[str]=None) -> FrozenSet[str]:
    """
    Add imprecision and noise to profile

    20% omit phenotype - omissions
    10% use closest parent - imprecision
    30% add random phenotype - noise, min 1

    All percentages are taken from the size of the input profile and
    rounded with round().  When fewer noise phenotypes are available than
    requested, as many as possible are added and a warning is logged
    instead of raising IndexError.

    :param pheno_profile: phenotype curies of the source profile
    :param pheno_subset: terms that may replace a phenotype (imprecision)
                         or be added as noise
    :param graph: graph handed to get_closure
    :param root: root term forwarded to get_closure
    :param ic_values: information content keyed by term id
    :param filter_out: terms that must never be added as noise
    :param ref_disease: only used in log messages
    :return: FrozenSet[str] - set of phenotype curies
    """
    omission_rate = .2  # .4 for gold, .2 for derived
    imprecision_rate = .1  # .3 for gold, .1 for derived
    noise_rate = .3

    phenotypes = list(pheno_profile)
    profile_size = len(phenotypes)

    # Remove x percent of phenotypes
    count_to_remove = round(profile_size * omission_rate)
    phenotypes = random.sample(phenotypes, profile_size - count_to_remove)

    # mutate x percent to closest parent
    count_to_mutate = round(profile_size * imprecision_rate)
    random.shuffle(phenotypes)
    counter = 0
    for idx, pheno in enumerate(phenotypes):
        if counter == count_to_mutate:
            break
        parents = get_closure(graph, pheno, RDFS['subClassOf'], root, False)
        # Replacement must be in the subset and not already in either the
        # source profile or the working profile
        lay_overlap = parents.intersection(pheno_subset).difference(
            pheno_profile, phenotypes)
        if not lay_overlap:
            continue
        max_ic = max(ic_values[parent] for parent in lay_overlap)
        # On ties the last term encountered wins
        mica = ''
        for phen in lay_overlap:
            if ic_values[phen] == max_ic:
                mica = phen
        phenotypes[idx] = mica
        counter += 1

    if counter != count_to_mutate:
        logging.info("Could not mutate profile derived from {}".format(ref_disease))

    # add random phenotype(s)
    # Filter out phenotypes from filter_out set
    candidates = list(
        pheno_subset.difference(filter_out, phenotypes, pheno_profile))

    comissions = round(profile_size * noise_rate)
    noise_count = 1 if comissions == 0 else comissions

    if not candidates:
        logging.warning("No phenotypes to select for "
                        "profile derived from {}".format(ref_disease))
    elif noise_count > len(candidates):
        logging.warning("Only {} of {} noise phenotypes available for "
                        "profile derived from {}".format(
                            len(candidates), noise_count, ref_disease))

    # Never draw more than what is left; random.choice raises IndexError
    # on an empty sequence
    noise_count = min(noise_count, len(candidates))
    for _ in range(noise_count):
        random_pheno = random.choice(candidates)
        phenotypes.append(random_pheno)
        candidates.remove(random_pheno)

    return frozenset(phenotypes)
# NOTE(review): the enclosing definition and the header of the loop that
# binds `mondo` for the next two statements are outside this view; the
# layout below is inferred from the statement order - confirm before merging.
# Fetch the direct phenotypes (and label) of `mondo` and store the phenotypes
# in gold_standard.
pheno_profile, mondo_label = monarch.get_direct_phenotypes(mondo)
gold_standard[mondo] = pheno_profile

# For every disease, write a "derived" profile restricted to lay_terms.
for mondo in diseases:
    # If list is not mondo
    # clique_leader = monarch.get_clique_leader(disease)
    # mondo = clique_leader['id']
    # mondo_label = clique_leader['label']
    # Get phenotypes
    gold_profile = set(gold_standard[mondo])
    # Gold phenotypes that are already in lay_terms are kept unchanged
    derived_profile = gold_profile.intersection(lay_terms)
    non_lay_terms = gold_profile - derived_profile
    for phenotype in non_lay_terms:
        # Terms in the subClassOf closure of the phenotype that are in lay_terms
        parents = owl_utils.get_closure(hpo, phenotype, RDFS['subClassOf'], root)
        lay_overlap = parents.intersection(lay_terms)
        if len(lay_overlap) == 0:
            # Nothing in lay_terms can stand in for this phenotype; drop it
            continue
        # Pick the overlapping term with the highest ic_map value; when
        # several share that value the last one iterated is used
        max_ic = max([ic_map[parent] for parent in lay_overlap])
        mica = ''
        for pheno in lay_overlap:
            if ic_map[pheno] == max_ic:
                mica = pheno
        derived_profile.add(mica)
    # One tab separated "disease<TAB>phenotype" row per derived phenotype
    for phenotype in derived_profile:
        output.write("{}\t{}\n".format(mondo, phenotype))
def _count_closure(graph, term, root, counts):
    """Add one annotation to every class owl_utils.get_closure returns for term."""
    for disease in owl_utils.get_closure(graph, term, root=root):
        try:
            counts[disease] += 1
        except KeyError:
            # Class was not among the descendants collected up front
            print(disease)
            counts[disease] = 1


def main():
    """
    Write one information content value per MONDO class as a two column tsv

    Every leaf class and every 'MONDO' row of the cached disease-phenotype
    file counts as one explicit annotation, which is propagated to each
    class owl_utils.get_closure returns for it.  For every class the ratio
    annotation count / explicit annotation total is passed to
    math_utils.information_content and written next to the class id.
    """
    parser = argparse.ArgumentParser(
        description='Generate information content for each HPO class using the '
                    'HPO phenotype annotation file ')
    parser.add_argument('--mondo_cache', '-m', type=str, required=True,
                        help='Cached 2 column disease phenotype tsv')
    parser.add_argument('--output', '-o', type=str, required=False,
                        help='Location of output file',
                        default="./mondo-ic-cache.tsv")
    args = parser.parse_args()

    # The total starts at 1, as in the original implementation
    explicit_annotations = 1
    root = "MONDO:0000001"

    # i/o
    # Opened before the expensive graph load so an unwritable path fails
    # early; the with block guarantees the handle is flushed and closed
    with open(args.output, 'w') as output_file:
        mondo_graph = Graph()

        # Previous cache made with 2018-08-03 version of mondo
        logger.info("Loading MONDO")
        mondo_graph.parse(
            "/path/to/git/mondo-2018-08-03/src/ontology/reasoned.owl",
            format='xml')

        logger.info("Getting classes")
        all_diseases = owl_utils.get_descendants(mondo_graph, root)
        disease_annotations: Dict[str, int] = {
            disease: 0 for disease in all_diseases
        }

        logger.info("Seeding leaf nodes")
        # Seed leaf nodes with 1 annotation
        for leaf in owl_utils.get_leaf_nodes(mondo_graph, root):
            explicit_annotations += 1
            _count_closure(mondo_graph, leaf, root, disease_annotations)

        logger.info("Fetching annotations")
        # newline='' is what the csv module expects of its input files
        with open(args.mondo_cache, 'r', newline='') as cache_file:
            reader = csv.reader(cache_file, delimiter='\t', quotechar='\"')
            for row in reader:
                # Blank lines come back as [], comment lines never start
                # with 'MONDO'
                if not row or not row[0].startswith('MONDO'):
                    continue
                mondo_id, _phenotype_id = row[0:2]
                explicit_annotations += 1
                _count_closure(mondo_graph, mondo_id, root,
                               disease_annotations)

        logger.info("Computing IC")
        for disease, annot_count in disease_annotations.items():
            output_file.write("{}\t{}\n".format(
                disease,
                math_utils.information_content(
                    annot_count / explicit_annotations)))
def pairwise_jaccard(pheno_a: str,
                     pheno_b: str,
                     graph: Graph,
                     root: str) -> float:
    """
    Apply jaccard() to the subClassOf closures of two terms

    :param pheno_a: first term id
    :param pheno_b: second term id
    :param graph: graph handed to owl_utils.get_closure
    :param root: root term forwarded to owl_utils.get_closure
    :return: float - whatever jaccard() returns for the two closures
    """
    subclass_of = RDFS['subClassOf']
    closure_a = owl_utils.get_closure(graph, pheno_a, subclass_of, root)
    closure_b = owl_utils.get_closure(graph, pheno_b, subclass_of, root)
    return jaccard(closure_a, closure_b)
# NOTE(review): the enclosing scope (where args, counter, mondo_diseases,
# mondo_diseases_tmp, associations, include_inferred and hpo are defined) is
# outside this view; the layout below is inferred from the statement order.
# Parse the gzipped MONDO OWL file into a graph.  The gzip handle is not
# closed anywhere in the visible code.
mondo = Graph()
mondo.parse(gzip.open("../data/owl/mondo.owl.gz", 'rb'), format='xml')

# Read disease-phenotype associations, one tab separated pair per line.
with open(args.mondo_assoc, 'r') as mondo_labels:
    for line in mondo_labels:
        # Skip comment lines and anything that is not a MONDO record
        if line.startswith('#'):
            continue
        if not line.startswith('MONDO'):
            continue

        # Progress message every 10000 accepted associations
        if counter % 10000 == 0:
            print("Processed {} associations".format(counter))

        disease, phenotype = line.rstrip("\n").split("\t")[0:2]

        # Copy the label for the disease; ids missing from
        # mondo_diseases_tmp are recorded as "obsoleted class"
        try:
            mondo_diseases[disease] = mondo_diseases_tmp[disease]
        except KeyError:
            mondo_diseases[disease] = "obsoleted class"

        # Do the same for every class in the closure of the disease
        disease_closure = get_closure(mondo, disease, root='MONDO:0000001')
        for dis in disease_closure:
            try:
                mondo_diseases[dis] = mondo_diseases_tmp[dis]
            except KeyError:
                mondo_diseases[dis] = "obsoleted class"

        # Pair the disease closure with either the phenotype closure or
        # just the asserted phenotype
        if include_inferred:
            phenotype_closure = get_closure(hpo, phenotype, root='HP:0000118')
            associations.append((disease_closure, phenotype_closure))
        else:
            associations.append((disease_closure, {phenotype}))

        counter += 1
def _read_ic_cache(path):
    """Parse a two column tsv (id, information content) into a dict of floats."""
    ic_map: Dict[str, float] = {}
    with open(path, 'r') as ic_fh:
        for line in ic_fh:
            term_id, ic = line.rstrip("\n").split("\t")
            ic_map[term_id] = float(ic)
    return ic_map


def _shared_ancestors(graph, diseases, root):
    """Intersect the owl_utils.get_closure results of all given diseases."""
    shared = None
    for disease in diseases:
        closure = owl_utils.get_closure(graph, disease, root=root)
        shared = closure if shared is None else shared.intersection(closure)
    return set() if shared is None else shared


def main():
    """
    Cluster and iterate over each cluster to find the
    best disease group that subsumes the cluster

    The matrix is clustered with ward linkage and cut into at most 525
    flat clusters.  For each cluster the classes common to the closures of
    all members are collected and the one with the highest cached
    information content is reported, one tab separated row per cluster.
    Rows go to --output when given, otherwise to stdout.
    """
    import sys  # local import: only needed for the stdout fallback

    parser = argparse.ArgumentParser(description='description')
    parser.add_argument('--input', '-i', type=str, required=True,
                        help='Location of input file'
                             ' that contains the sim matrix as'
                             ' comma-delimited text')
    parser.add_argument('--label', '-l', type=str, required=True,
                        help='Location of id-label mapping file')
    parser.add_argument('--ic_cache', '-ic', type=str, required=True)
    parser.add_argument('--output', '-o', required=False, help='output file')
    args = parser.parse_args()

    logger.info("loading matrix")
    matrix = np.loadtxt(args.input, delimiter=",")

    with open(args.label, 'r') as label_fh:
        labels = [line.rstrip('\n').split('\t')[0] for line in label_fh]

    ic_map = _read_ic_cache(args.ic_cache)

    # --output is optional and has no default, so open(None, 'w') used to
    # raise TypeError; fall back to stdout instead
    output = open(args.output, 'w') if args.output else sys.stdout
    try:
        # Header names the seven columns written per cluster below
        output.write("#cluster_id\tcluster_size\tmica_id\tmica_label\t"
                     "mica_ic\tsubclass_count\tmembers\n")

        mondo_graph = Graph()

        logger.info("loading mondo")
        # Previous cache made with 2018-08-03 version of mondo
        mondo_graph.parse(
            "/path/to/git/mondo-2018-08-03/src/ontology/reasoned.owl",
            format='xml')

        root = "MONDO:0000001"

        mondo_skip = {
            'MONDO:0023807',
            'MONDO:0000559',
            'MONDO:0009117',
            'MONDO:0016961',
            'MONDO:0011750',
            'MONDO:0017180',
        }

        cluster_map = {}

        logger.info("clustering")
        Z = linkage(squareform(matrix), 'ward')
        # cosine weighted = 2631
        # resnik = 2453
        # euclidean = 525
        clusters = fcluster(Z, 525, 'maxclust')

        for disease_id, cluster_id in zip(labels, clusters):
            cluster_map.setdefault(cluster_id, []).append(disease_id)

        for cluster_id, members in cluster_map.items():
            diseases = members
            skipped = set(members).intersection(mondo_skip)
            if skipped:
                if len(members) == len(skipped):
                    # Every member is on the skip list; evaluated anyway
                    logger.warning("Cannot evaluate cluster")
                    diseases = set(members)
                else:
                    diseases = set(members) - mondo_skip

            common_ancestors = _shared_ancestors(mondo_graph, diseases, root)

            mica = max(ic_map[d] for d in common_ancestors)
            # On ties the last class encountered wins
            for dis in common_ancestors:
                if ic_map[dis] == mica:
                    mica_id = dis

            label = monarch.get_label(mica_id)

            # Number of subclasses for dis
            subclass_count = len(
                owl_utils.get_closure(mondo_graph, mica_id,
                                      negative=True, reflexive=False))

            # Size and member list use the full cluster, skipped ids included
            output.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
                cluster_id,
                len(members),
                mica_id,
                label,
                mica,
                subclass_count,
                "|".join(members)))
    finally:
        if output is not sys.stdout:
            output.close()