def print_hpo(out_dir):
    """Write the HPO resource files into a directory.

    Four resources are fetched one after the other. Each one is written to a
    fixed file name below ``out_dir``, one fetched line per output line.

    Args:
        out_dir(Path): Directory that receives the files
    """

    def _dump(file_name, log_message, fetch_lines):
        """Log the destination, then write every fetched line followed by a newline."""
        destination = out_dir / file_name
        LOG.info(log_message, destination)
        with destination.open("w", encoding="utf-8") as handle:
            handle.writelines(row + "\n" for row in fetch_lines())

    _dump("hpo.obo", "Download HPO terms to %s", fetch_hpo_terms)
    _dump(
        "ALL_SOURCES_ALL_FREQUENCIES_genes_to_phenotype.txt",
        "Download HPO genes to %s",
        fetch_hpo_genes,
    )
    _dump(
        "ALL_SOURCES_ALL_FREQUENCIES_phenotype_to_genes.txt",
        "Download HPO TO genes to %s",
        fetch_hpo_to_genes,
    )
    _dump(
        "ALL_SOURCES_ALL_FREQUENCIES_diseases_to_genes_to_phenotypes.txt",
        "Download HPO disease %s",
        fetch_hpo_phenotype_to_terms,
    )
def get_reduced_hpo_terms(hpo_terms):
    """Yield the lines of the HPO term file, restricted to a chosen set of terms.

    The lines that precede the first ``[Term]`` stanza (the header) are always
    yielded. A stanza is yielded in full only when its ``id: `` line names a
    term that is present in ``hpo_terms``.

    Args:
        hpo_terms(set(str)): Set of chosen term ids that should be included

    Yields:
        hpo_line(str): A line with hpo information
    """
    stanza_lines = []
    # Everything before the first [Term] is the header, which is always kept
    keep = True
    in_header = True
    nr_terms = 0
    nr_kept = 0

    for line in fetch_hpo_terms():
        if line.startswith("[Term]"):
            # A new stanza starts: emit the buffered one if it was selected
            if keep:
                # The header is emitted too, but it is not a term
                if not in_header:
                    nr_kept += 1
                yield from stanza_lines
            nr_terms += 1
            in_header = False
            keep = False
            stanza_lines = []
        elif line.startswith("id: "):
            # Strip so trailing whitespace or a newline cannot defeat the lookup
            if line[4:].strip() in hpo_terms:
                keep = True
        stanza_lines.append(line)

    # Emit (and count) whatever is still buffered when the input ends
    if keep:
        if not in_header:
            nr_kept += 1
        yield from stanza_lines

    LOG.info("Nr of terms in file %s", nr_terms)
    LOG.info("Nr of terms kept: %s", nr_kept)
def test_fetch_hpo(hpo_terms_file):
    """Fetching the HPO terms returns lines that start with the obo header"""
    # GIVEN a registered GET response that serves the content of the HPO terms file
    hpo_url = "https://raw.githubusercontent.com/obophenotype/human-phenotype-ontology/master/hp.obo"
    with open(hpo_terms_file, "r") as handle:
        obo_body = handle.read()
    responses.add(responses.GET, hpo_url, body=obo_body, status=200)

    # WHEN fetching the resource
    fetched_lines = scout_requests.fetch_hpo_terms()

    # THEN assert that the HPO header is there
    first_line = fetched_lines[0]
    assert "format-version" in first_line
def test_fetch_hpo(hpo_terms_file):
    """Fetching the HPO terms returns lines that start with the obo header"""
    # GIVEN a registered GET response that serves the content of the HPO terms file
    hpo_url = "http://purl.obolibrary.org/obo/hp.obo"
    with open(hpo_terms_file, "r") as handle:
        obo_body = handle.read()
    responses.add(responses.GET, hpo_url, body=obo_body, status=200)

    # WHEN fetching the resource
    fetched_lines = scout_requests.fetch_hpo_terms()

    # THEN assert that the HPO header is there
    first_line = fetched_lines[0]
    assert "format-version" in first_line
def load_hpo(
    adapter,
    disease_lines=None,
    hpo_lines=None,
    hpo_gene_lines=None,
):
    """Load the hpo terms and hpo diseases into database

    The hpo terms are always loaded. The disease terms are loaded only when
    ``disease_lines`` is given; otherwise a warning is logged and the function
    returns after the terms have been loaded.

    Args:
        adapter(MongoAdapter)
        disease_lines(iterable(str)): These are the omim genemap2 information
        hpo_lines(iterable(str)): lines from file http://purl.obolibrary.org/obo/hp.obo
            Fetched with fetch_hpo_terms() when missing or empty.
        hpo_gene_lines(iterable(str)): lines from file
            https://ci.monarchinitiative.org/view/hpo/job/hpo.annotations/lastSuccessfulBuild/artifact/rare-diseases/util/annotation/phenotype_to_genes.txt
            Fetched with fetch_hpo_to_genes_to_disease() when missing or empty.
            May be a single-pass iterable such as a file handle or a generator.
    """
    # Create a map from gene aliases to gene objects
    alias_genes = adapter.genes_by_alias()

    # Fetch the hpo terms if no file
    if not hpo_lines:
        hpo_lines = fetch_hpo_terms()

    # The gene lines are handed to both load_hpo_terms and load_disease_terms.
    # A single-pass iterable would be exhausted by the first of them, so it is
    # materialised up front whenever the second pass is going to happen.
    if hpo_gene_lines and disease_lines and not isinstance(hpo_gene_lines, (list, tuple)):
        hpo_gene_lines = list(hpo_gene_lines)

    # Fetch the hpo gene information if no file (or if the input turned out empty)
    if not hpo_gene_lines:
        hpo_gene_lines = fetch_hpo_to_genes_to_disease()

    load_hpo_terms(adapter, hpo_lines, hpo_gene_lines, alias_genes)

    if not disease_lines:
        LOG.warning("No omim information, skipping to load disease terms")
        return

    load_disease_terms(
        adapter=adapter,
        genemap_lines=disease_lines,
        genes=alias_genes,
        hpo_disease_lines=hpo_gene_lines,
    )
def load_hpo_terms(adapter, hpo_lines=None, hpo_gene_lines=None, alias_genes=None):
    """Parse the hpo terms, attach gene ids to them and store them in the database

    Any input that is missing (or empty) is obtained on the fly: the term lines
    and gene lines are fetched, the alias map is read from the adapter.

    Args:
        adapter(MongoAdapter)
        hpo_lines(iterable(str)): lines from file http://purl.obolibrary.org/obo/hp.obo
        hpo_gene_lines(iterable(str)): lines from file
            https://ci.monarchinitiative.org/view/hpo/job/hpo.annotations/lastSuccessfulBuild/artifact/rare-diseases/util/annotation/phenotype_to_genes.txt
        alias_genes: map from gene symbol to gene information; the entry stored
            under the "true" key is used as the hgnc id
    """
    # Download the term lines unless the caller supplied them
    hpo_lines = hpo_lines or fetch_hpo_terms()

    LOG.info("Parsing hpo terms")
    hpo_terms = build_hpo_tree(hpo_lines)

    # Download the gene annotation lines unless the caller supplied them
    hpo_gene_lines = hpo_gene_lines or fetch_hpo_to_genes_to_disease()

    # Symbol -> gene info lookup, taken from scout when not supplied
    alias_genes = alias_genes or adapter.genes_by_alias()

    LOG.info("Adding gene information to hpo terms ...")
    for annotation in parse_hpo_to_genes(hpo_gene_lines):
        symbol = annotation["hgnc_symbol"]
        term_id = annotation["hpo_id"]

        # Symbols that scout does not know are ignored
        gene_entry = alias_genes.get(symbol)
        if not gene_entry:
            continue
        true_id = gene_entry["true"]

        # So are annotations for terms that are not part of the parsed tree
        if term_id not in hpo_terms:
            continue

        term = hpo_terms[term_id]
        if "genes" not in term:
            term["genes"] = set()
        term["genes"].add(true_id)

    started = datetime.now()

    LOG.info("Loading the hpo terms...")
    nr_terms = len(hpo_terms)
    pending = []

    with progressbar(hpo_terms.values(), label="Loading hpo terms", length=nr_terms) as bar:
        for term_info in bar:
            pending.append(build_hpo_term(term_info))
            if len(pending) <= 10000:
                continue
            # Batch is full: write it out and start a new one
            adapter.load_hpo_bulk(pending)
            pending = []

    # Write the last, partially filled batch
    if pending:
        adapter.load_hpo_bulk(pending)

    LOG.info("Loading done. Nr of terms loaded {0}".format(nr_terms))
    LOG.info("Time to load terms: {0}".format(datetime.now() - started))
all_ancestors = get_all_ancestors(hpo_tree, term, set()) term["all_ancestors"] = all_ancestors return hpo_tree if __name__ == "__main__": import sys from pprint import pprint as pp from scout.utils.handle import get_file_handle from scout.utils.scout_requests import fetch_hpo_terms if not len(sys.argv) > 1: file_handle = fetch_hpo_terms() else: file_handle = get_file_handle(sys.argv[1]) hpo_tree = build_hpo_tree(file_handle) my_term = hpo_tree["HP:0200024"] pp(my_term) # print(get_all_ancestors(hpo_tree, my_term)) for term in hpo_tree: pp(hpo_tree[term]) # phenotypes = parse_hpo_phenotypes(file_handle) # for hpo_id in phenotypes: # hpo_term = phenotypes[hpo_id]