예제 #1
0
def print_hpo(out_dir):
    """Write the HPO resource files into a directory.

    Four resources are handled one after another. For each of them the
    target path is logged, the file is opened for writing (UTF-8) and every
    line returned by the matching fetch function is written followed by a
    newline.

    Args:
        out_dir(Path): directory that receives the files
    """

    def _dump(file_name, log_message, fetch_lines):
        """Log the target path, then write the fetched lines to it."""
        target = out_dir / file_name
        LOG.info(log_message, target)
        with target.open("w", encoding="utf-8") as handle:
            # The fetch is triggered only after the file has been opened
            for hpo_line in fetch_lines():
                handle.write(hpo_line + "\n")

    _dump("hpo.obo", "Download HPO terms to %s", fetch_hpo_terms)
    _dump(
        "ALL_SOURCES_ALL_FREQUENCIES_genes_to_phenotype.txt",
        "Download HPO genes to %s",
        fetch_hpo_genes,
    )
    _dump(
        "ALL_SOURCES_ALL_FREQUENCIES_phenotype_to_genes.txt",
        "Download HPO TO genes to %s",
        fetch_hpo_to_genes,
    )
    _dump(
        "ALL_SOURCES_ALL_FREQUENCIES_diseases_to_genes_to_phenotypes.txt",
        "Download HPO disease %s",
        fetch_hpo_phenotype_to_terms,
    )
예제 #2
0
def get_reduced_hpo_terms(hpo_terms):
    """Return a reduced version of the hpo terms

    The lines returned by ``fetch_hpo_terms()`` are split into chunks at
    every line starting with "[Term]". The chunk before the first "[Term]"
    (the header) is always yielded. A term chunk is yielded only when it
    holds an "id: <hpo id>" line whose id is in ``hpo_terms``. Lines that
    follow the last "[Term]" line are treated as part of that last term.

    Args:
        hpo_terms(set(str)): Set of chosen terms that should be included

    Yields:
        hpo_line(str): A line with hpo information
    """
    id_prefix = "id: "

    chunk_lines = []
    # We want to keep the header lines
    keep = True
    # The header chunk is not a term and must not be counted as one
    in_term = False

    nr_terms = 0
    nr_kept = 0

    for line in fetch_hpo_terms():
        # When we encounter a new term we yield all lines of the previous chunk
        if line.startswith("[Term]"):
            nr_terms += 1
            if keep:
                if in_term:
                    nr_kept += 1
                yield from chunk_lines

            keep = False
            in_term = True
            chunk_lines = []

        elif line.startswith(id_prefix):
            # Everything after the "id: " prefix is the term identifier
            if line[len(id_prefix):] in hpo_terms:
                keep = True

        chunk_lines.append(line)

    # Flush the last chunk, it is not followed by another "[Term]" line
    if keep:
        if in_term:
            nr_kept += 1
        yield from chunk_lines

    LOG.info("Nr of terms in file %s", nr_terms)
    LOG.info("Nr of terms kept: %s", nr_kept)
예제 #3
0
def test_fetch_hpo(hpo_terms_file):
    """Fetch the HPO terms against a registered response and check the header."""

    # GIVEN a GET response registered for the HPO url, serving the content
    # of the HPO terms file
    hpo_url = (
        "https://raw.githubusercontent.com/obophenotype/human-phenotype-ontology/master/hp.obo"
    )
    with open(hpo_terms_file, "r") as handle:
        obo_text = handle.read()
    responses.add(responses.GET, hpo_url, body=obo_text, status=200)

    # WHEN the HPO terms are fetched
    fetched = scout_requests.fetch_hpo_terms()

    # THEN the first item of the result holds the "format-version" header
    first_item = fetched[0]
    assert "format-version" in first_item
예제 #4
0
def test_fetch_hpo(hpo_terms_file):
    """Fetch the HPO terms from a registered purl response and check the header."""

    # GIVEN the content of the HPO terms file registered as the body of a
    # successful GET response for the HPO url
    with open(hpo_terms_file, "r") as handle:
        response_body = handle.read()
    registration = {
        "body": response_body,
        "status": 200,
    }
    responses.add(responses.GET, "http://purl.obolibrary.org/obo/hp.obo", **registration)

    # WHEN the HPO terms are fetched
    hpo_data = scout_requests.fetch_hpo_terms()

    # THEN the first item of the result holds the "format-version" header
    assert "format-version" in hpo_data[0]
예제 #5
0
def load_hpo(
    adapter,
    disease_lines=None,
    hpo_lines=None,
    hpo_gene_lines=None,
):
    """Load the hpo terms and hpo diseases into database

    The hpo terms are always loaded. The disease terms are loaded only when
    ``disease_lines`` is given, otherwise a warning is logged and the
    function returns after the hpo terms.

    Args:
        adapter(MongoAdapter)
        disease_lines(iterable(str)): These are the omim genemap2 information
        hpo_lines(iterable(str)): lines from file http://purl.obolibrary.org/obo/hp.obo
        hpo_gene_lines(iterable(str)): lines from file
            https://ci.monarchinitiative.org/view/hpo/job/hpo.annotations/lastSuccessfulBuild/artifact/rare-diseases/util/annotation/phenotype_to_genes.txt

    """
    # Create a map from gene aliases to gene objects
    alias_genes = adapter.genes_by_alias()

    # Fetch the hpo terms if no file
    if not hpo_lines:
        hpo_lines = fetch_hpo_terms()

    # Fetch the hpo gene information if no file
    if not hpo_gene_lines:
        hpo_gene_lines = fetch_hpo_to_genes_to_disease()

    # The gene lines are consumed twice below (hpo terms, then disease terms).
    # A one-shot iterator such as an open file handle would be exhausted after
    # the first pass, so the lines are materialised once here.
    hpo_gene_lines = list(hpo_gene_lines)

    load_hpo_terms(adapter, hpo_lines, hpo_gene_lines, alias_genes)

    if not disease_lines:
        LOG.warning("No omim information, skipping to load disease terms")
        return

    load_disease_terms(
        adapter=adapter,
        genemap_lines=disease_lines,
        genes=alias_genes,
        hpo_disease_lines=hpo_gene_lines,
    )
예제 #6
0
def load_hpo_terms(adapter,
                   hpo_lines=None,
                   hpo_gene_lines=None,
                   alias_genes=None):
    """Load the hpo terms into the database

    Parse the hpo lines, attach the hgnc ids of the associated genes to each
    term, build the term objects and add them to the database in batches.

    Args:
        adapter(MongoAdapter)
        hpo_lines(iterable(str)): lines from file http://purl.obolibrary.org/obo/hp.obo
        hpo_gene_lines(iterable(str)): lines from file
            https://ci.monarchinitiative.org/view/hpo/job/hpo.annotations/lastSuccessfulBuild/artifact/rare-diseases/util/annotation/phenotype_to_genes.txt
        alias_genes(dict): map from gene symbol to gene information, the
            entry under the key "true" is used as hgnc id. Fetched with
            adapter.genes_by_alias() when not given
    """
    # Fetch the hpo terms if no file
    if not hpo_lines:
        hpo_lines = fetch_hpo_terms()

    # Parse the terms
    LOG.info("Parsing hpo terms")
    hpo_terms = build_hpo_tree(hpo_lines)

    # Fetch the hpo gene information if no file
    if not hpo_gene_lines:
        hpo_gene_lines = fetch_hpo_to_genes_to_disease()

    # Get a map with hgnc symbols to hgnc ids from scout
    if not alias_genes:
        alias_genes = adapter.genes_by_alias()

    LOG.info("Adding gene information to hpo terms ...")
    for hpo_to_symbol in parse_hpo_to_genes(hpo_gene_lines):
        hgnc_symbol = hpo_to_symbol["hgnc_symbol"]
        hpo_id = hpo_to_symbol["hpo_id"]

        # Fetch gene info to get correct hgnc id, skip unknown symbols
        gene_info = alias_genes.get(hgnc_symbol)
        if not gene_info:
            continue

        hgnc_id = gene_info["true"]

        # Skip annotations for terms that are not part of the parsed tree
        if hpo_id not in hpo_terms:
            continue

        hpo_terms[hpo_id].setdefault("genes", set()).add(hgnc_id)

    start_time = datetime.now()

    LOG.info("Loading the hpo terms...")
    nr_terms = len(hpo_terms)
    hpo_bulk = []
    with progressbar(hpo_terms.values(),
                     label="Loading hpo terms",
                     length=nr_terms) as bar:

        for hpo_info in bar:
            hpo_bulk.append(build_hpo_term(hpo_info))

            # Flush inside the loop so the pending batch stays bounded
            if len(hpo_bulk) > 10000:
                adapter.load_hpo_bulk(hpo_bulk)
                hpo_bulk = []

    # Load whatever is left after the last full batch
    if hpo_bulk:
        adapter.load_hpo_bulk(hpo_bulk)

    LOG.info("Loading done. Nr of terms loaded %s", nr_terms)
    LOG.info("Time to load terms: %s", datetime.now() - start_time)
예제 #7
0
        all_ancestors = get_all_ancestors(hpo_tree, term, set())

        term["all_ancestors"] = all_ancestors

    return hpo_tree


if __name__ == "__main__":
    import sys
    from pprint import pprint
    from scout.utils.handle import get_file_handle
    from scout.utils.scout_requests import fetch_hpo_terms

    # Use the file given as first command line argument when there is one,
    # otherwise get the hpo lines from fetch_hpo_terms()
    cli_args = sys.argv[1:]
    if cli_args:
        hpo_source = get_file_handle(cli_args[0])
    else:
        hpo_source = fetch_hpo_terms()

    hpo_tree = build_hpo_tree(hpo_source)

    # Show one specific term first
    pprint(hpo_tree["HP:0200024"])

    # Then show every term of the tree
    for term_info in hpo_tree.values():
        pprint(term_info)

    # phenotypes = parse_hpo_phenotypes(file_handle)
    # for hpo_id in phenotypes:
    #     hpo_term = phenotypes[hpo_id]