Exemplo n.º 1
0
def hpo(out_dir, terms, genes, disease):
    """Fetch HPO resources and either echo one of them or pass them on

    When none of terms, genes or disease is set, all three HPO resources are
    fetched, out_dir is created if it is missing and the result is passed to
    print_hpo together with out_dir.

    Otherwise only the flagged resources are requested and the lines of a
    single resource are echoed to the terminal: terms takes precedence over
    genes, which takes precedence over disease.
    """
    if not (terms or genes or disease):
        # Nothing specific was asked for: fetch every resource
        hpo_info = fetch_hpo_files(
            genes_to_phenotype=True,
            phenotype_to_genes=True,
            hpo_terms=True,
        )
        out_dir = pathlib.Path(out_dir)
        out_dir.mkdir(parents=True, exist_ok=True)
        LOG.info("Download HPO resources to %s", out_dir)

        print_hpo(out_dir, hpo_info)
        return

    hpo_info = fetch_hpo_files(
        genes_to_phenotype=genes,
        phenotype_to_genes=disease,
        hpo_terms=terms,
    )

    # Only one resource is echoed, even if several flags were given
    if terms:
        resource = "hpo_terms"
    elif genes:
        resource = "genes_to_phenotype"
    else:
        resource = "phenotype_to_genes"

    for line in hpo_info[resource]:
        click.echo(line)
Exemplo n.º 2
0
def test_fetch_hpo_files(phenotype_to_genes_file, hpo_genes_file):
    """Check that fetch_hpo_files returns a dict for two registered hpo URLs"""

    # GIVEN the URLs of two hpo files, each answering with a local file's content
    urls = [
        scout_requests.HPO_URL.format(file_name)
        for file_name in ("phenotype_to_genes.txt", "genes_to_phenotype.txt")
    ]
    local_files = [phenotype_to_genes_file, hpo_genes_file]

    for url, local_file in zip(urls, local_files):
        with open(local_file, "r") as handle:
            payload = handle.read()

        responses.add(responses.GET, url, body=payload, status=200)

    # WHEN fetching both hpo files
    fetched = scout_requests.fetch_hpo_files(
        genes_to_phenotype=True,
        phenotype_to_genes=True,
    )

    # THEN assert that the result is a dictionary
    assert isinstance(fetched, dict)
Exemplo n.º 3
0
def generate_hpo_files(genes):
    """Generate files with hpo reduced information

    Fetches the "hpogenes", "hpoterms" and "phenotype_to_terms" resources and
    writes a reduced copy of each one to its module level output path. A
    reduced file holds the first line of the resource (the header) followed by
    every line whose symbol column is found in genes.

    Args:
        genes: Collection of symbols to keep, only used for membership checks
    """
    hpo_files = fetch_hpo_files(hpogenes=True,
                                hpoterms=True,
                                phenotype_to_terms=True,
                                hpodisease=False)

    file_names = {
        "hpogenes": hpogenes_reduced_path,
        "hpoterms": hpoterms_reduced_path,
        "phenotype_to_terms": hpo_phenotype_to_terms_reduced_path,
    }

    # Index of the tab separated column that is compared against genes
    symbol_columns = {
        "hpogenes": 1,
        "hpoterms": 3,
        "phenotype_to_terms": 1,
    }

    for name, out_path in file_names.items():
        hpo_lines = hpo_files[name]
        symbol_column = symbol_columns[name]

        # The context manager makes sure the file is flushed and closed, also
        # when a line raises while being parsed
        with open(out_path, "w") as outfile:
            LOG.info("Writing file %s", out_path)

            for i, line in enumerate(hpo_lines):
                line = line.rstrip()
                # Skip lines that are empty or hold a single character
                if len(line) <= 1:
                    continue
                if i == 0:  # Header line
                    outfile.write(line + "\n")
                    continue

                hgnc_symbol = line.split("\t")[symbol_column]
                if hgnc_symbol in genes:
                    outfile.write(line + "\n")

        LOG.info("File ready")
Exemplo n.º 4
0
def load_hgnc_genes(
    adapter,
    genes=None,
    ensembl_lines=None,
    hgnc_lines=None,
    exac_lines=None,
    mim2gene_lines=None,
    genemap_lines=None,
    hpo_lines=None,
    build="37",
    omim_api_key="",
):
    """Build gene objects and load them into the database

    If genes is empty or missing, every resource that was not passed in is
    fetched and link_genes merges all of them into the genes dictionary.
    Genes without a "chromosome" value are skipped; the remaining ones are
    built with build_hgnc_gene and loaded in bulk through the adapter.

    Args:
        adapter(scout.adapter.MongoAdapter)
        genes(dict): If genes are already parsed
        ensembl_lines(iterable(str)): Lines formated with ensembl gene information
        hgnc_lines(iterable(str)): Lines with gene information from genenames.org
        exac_lines(iterable(str)): Lines with information pLi-scores from ExAC
        mim2gene_lines(iterable(str)): Lines with map from omim id to gene symbol
        genemap_lines(iterable(str)): Lines with information of omim entries
        hpo_lines(iterable(str)): Lines information about map from hpo terms to genes
        build(str): What build to use. Defaults to '37'
        omim_api_key(str): Key used to fetch the omim files when mim2gene_lines
            or genemap_lines is missing. Without it only a warning is logged

    Returns:
        gene_objects(list): A list with all gene_objects that was loaded into database
    """
    if not genes:
        # Fetch whatever was not handed in by the caller
        if ensembl_lines is None:
            ensembl_lines = fetch_ensembl_genes(build=build)
        if not hgnc_lines:
            hgnc_lines = fetch_hgnc()
        if not exac_lines:
            exac_lines = fetch_exac_constraint()

        # Both omim resources are fetched again as soon as one is missing
        if not mim2gene_lines or not genemap_lines:
            if omim_api_key:
                mim_files = fetch_mim_files(
                    omim_api_key,
                    mim2genes=True,
                    genemap2=True,
                )
                mim2gene_lines = mim_files["mim2genes"]
                genemap_lines = mim_files["genemap2"]
            else:
                LOG.warning("No omim api key provided!")

        if not hpo_lines:
            hpo_lines = fetch_hpo_files(hpogenes=True)["hpogenes"]

        # Merge all resources into one dictionary of gene information
        genes = link_genes(
            ensembl_lines=ensembl_lines,
            hgnc_lines=hgnc_lines,
            exac_lines=exac_lines,
            hpo_lines=hpo_lines,
            mim2gene_lines=mim2gene_lines,
            genemap_lines=genemap_lines,
        )

    gene_objects = []
    without_coordinates = 0

    with progressbar(
        genes.values(),
        label="Building genes",
        length=len(genes),
    ) as bar:
        for gene_data in bar:
            if gene_data.get("chromosome"):
                gene_objects.append(build_hgnc_gene(gene_data, build=build))
                continue

            LOG.debug(
                "skipping gene: %s. No coordinates found",
                gene_data.get("hgnc_symbol", "?"),
            )
            without_coordinates += 1

    LOG.info("Loading genes build %s", build)
    adapter.load_hgnc_bulk(gene_objects)

    LOG.info("Loading done. %s genes loaded", len(gene_objects))
    LOG.info(
        "Nr of genes without coordinates in build %s: %s",
        build,
        without_coordinates,
    )

    return gene_objects